From 6043063d2b7156e05d9abc253035fcbcb5027a28 Mon Sep 17 00:00:00 2001 From: Johnathan Rhyne Date: Wed, 5 Feb 2025 10:21:10 -0700 Subject: [PATCH 01/10] double tests pass. TODO: Run single tests for slarfb0c2 --- SRC/CMakeLists.txt | 2 +- SRC/Makefile | 2 +- SRC/dlarfb0c2.f | 263 +++++++++++++++++++++++++++++++++++++++++++++ SRC/dorglq.f | 25 ++--- SRC/dorgql.f | 14 +-- SRC/dorgqr.f | 172 +++++++++++++++++------------ SRC/dorgrq.f | 17 +-- SRC/slarfb0c2.f | 263 +++++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 648 insertions(+), 110 deletions(-) create mode 100644 SRC/dlarfb0c2.f create mode 100644 SRC/slarfb0c2.f diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt index be426cecd4..88e503efea 100644 --- a/SRC/CMakeLists.txt +++ b/SRC/CMakeLists.txt @@ -307,7 +307,7 @@ set(DLASRC dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqp2rk.f dlaqp3rk.f dlaqsb.f dlaqsp.f dlaqsy.f dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f - dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f dlarf1f.f dlarf1l.f + dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f dlarf1f.f dlarf1l.f dlarfb0c2.f dlargv.f dlarmm.f dlarrv.f dlartv.f dlarz.f dlarzb.f dlarzt.f dlaswp.f dlasy2.f dlasyf.f dlasyf_rook.f dlasyf_rk.f dlasyf_aa.f diff --git a/SRC/Makefile b/SRC/Makefile index 0191626f0e..dc8f472092 100644 --- a/SRC/Makefile +++ b/SRC/Makefile @@ -339,7 +339,7 @@ DLASRC = \ dlaqgb.o dlaqge.o dlaqp2.o dlaqps.o dlaqp2rk.o dlaqp3rk.o dlaqsb.o dlaqsp.o dlaqsy.o \ dlaqr0.o dlaqr1.o dlaqr2.o dlaqr3.o dlaqr4.o dlaqr5.o \ dlaqtr.o dlar1v.o dlar2v.o iladlr.o iladlc.o \ - dlarf.o dlarfb.o dlarfb_gett.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o dlarf1f.o dlarf1l.o\ + dlarf.o dlarfb.o dlarfb_gett.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o dlarf1f.o dlarf1l.o dlarfb0c2.o\ dlargv.o dlarmm.o dlarrv.o dlartv.o \ dlarz.o dlarzb.o dlarzt.o dlaswp.o dlasy2.o \ dlasyf.o dlasyf_rook.o dlasyf_rk.o \ diff --git a/SRC/dlarfb0c2.f b/SRC/dlarfb0c2.f new file mode 100644 index 0000000000..cb7f9d3873 --- /dev/null +++ b/SRC/dlarfb0c2.f @@ -0,0 +1,263 @@ + SUBROUTINE DLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V, + $ LDV, T, LDT, C, LDC) + ! Scalar arguments + INTEGER M, N, K, LDV, LDC, LDT + ! Note: We should probably get rid of SIDE and TRANS as these flags + ! are not used as we are only using this routine inside + ! x{OR,UN}G{QR,LQ,QR,LQ}, which have values that never change + CHARACTER SIDE, TRANS, DIRECT, STOREV + ! Array arguments + DOUBLE PRECISION V(LDV,*), C(LDC,*), T(LDT,*) + ! Local scalars + LOGICAL QR, LQ, QL, DIRF, COLV + ! External subroutines + EXTERNAL DGEMM, DTRMM + ! External functions + LOGICAL LSAME + EXTERNAL LSAME + ! Parameters + DOUBLE PRECISION ONE, ZERO, NEG_ONE + PARAMETER(ONE=1.0D+0, ZERO = 0.0D+0, NEG_ONE = -1.0D+0) + + ! Beginning of executable statements + ! Convert our character flags to logical values + DIRF = LSAME(DIRECT,'F') + COLV = LSAME(STOREV,'C') + + ! Determine which of the 4 modes are using. + ! QR is when we store the reflectors column by column and have the + ! 'first' reflector stored in the first column + QR = DIRF.AND.COLV + + ! LQ is when we store the reflectors row by row and have the + ! 'first' reflector stored in the first row + LQ = DIRF.AND.(.NOT.COLV) + + ! QL is when we store the reflectors column by column and have the + ! 'first' reflector stored in the last column + QL = (.NOT.DIRF).AND.COLV + + ! RQ is when we store the reflectors row by row and have the + ! 'first' reflector stored in the last row + ! RQ = (.NOT.DIRF).AND.(.NOT.COLV) + ! Since we have exactly one of these 4 modes, we don't need to actually + ! store the value of RQ, instead we assume this is the case if we fail + ! the above 3 checks. + + IF (QR) THEN + ! We are computing C = HC = (I - VTV')C + ! Where: V = [ V1 ] and C = [ C1 ] + ! [ V2 ] [ C2 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{M-K\times K} + ! C1=0\in\R^{K\times N} + ! C2\in\R^{M-K\times N} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = HC = (I - [ V1 ]T [V1', V2'])[ 0 ] + ! [ V2 ] [ C2 ] + ! = [ 0 ] - [ V1 ]T*V2'*C2 + ! [ C2 ] [ V2 ] + ! + ! = [ 0 ] - [ V1*T*V2'*C2 ] + ! [ C2 ] [ V2*T*V2'*C2 ] + ! + ! = [ V1*T*V2'*C2 ] + ! [ C2 - V2*T*V2'*C2 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = V2'*C2 + ! C1 = T*C1 + ! C2 = C2 - V2*C1 + ! C1 = -V1*C1 + ! + ! To achieve the same end result + ! + ! C1 = V2'*C2 + ! + CALL DGEMM('Transpose', 'No Transpose', K, N, M - K, + $ ONE, V(K+1,1), LDV, C(K+1,1), LDC, ZERO, + $ C, LDC) + ! + ! C1 = T*C1 + ! + CALL DTRMM('Left', 'Upper', 'No Transpose', 'Non-unit', + $ K, N, ONE, T, LDT, C, LDC) + ! + ! C2 = C2 - V2*C1 = -V2*C1 + C2 + ! + CALL DGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V(K+1,1), LDV, C, LDC, ONE, + $ C(K+1,1), LDC) + ! + ! C1 = -V1*C1 + ! + CALL DTRMM('Left', 'Lower', 'No Transpose', 'Unit', + $ K, N, NEG_ONE, V, LDV, C, LDC) + ELSE IF (LQ) THEN + ! We are computing C = CH' = C(I-V'T'V) + ! Where: V = [ V1 V2 ] and C = [ C1 C2 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{K\times N-K} + ! C1=0\in\R^{M\times K} + ! C2\in\R^{M\times N-K} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = CH' = [ 0, C2 ](I - [ V1' ]T'[ V1, V2 ]) + ! [ V2' ] + ! + ! = [ 0, C2 ] - [ 0, C2 ][ V1' ]T'[ V1, V2 ] + ! [ V2' ] + ! + ! = [ 0, C2 ] - C2*V2'*T'[ V1, V2 ] + ! + ! = [ -C2*V2'*T'*V1, C2 - C2*V2'*T'*V2 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = C2*V2' + ! C1 = C1*T' + ! C2 = C2 - C1*V2 + ! C1 = -C1*V1 + ! + ! To achieve the same end result + ! + ! C1 = C2*V2' + ! + CALL DGEMM('No Transpose', 'Transpose', M, K, N-K, + $ ONE, C(1,K+1), LDC, V(1, K+1), LDV, ZERO, C, LDC) + ! + ! C1 = C1*T' + ! + CALL DTRMM('Right', 'Upper', 'Transpose', 'Non-unit', + $ M, K, ONE, T, LDT, C, LDC) + ! + ! C2 = C2 - C1*V2 = -C1*V2 + C2 + ! + CALL DGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C, LDC, V(1,K+1), LDV, ONE, + $ C(1,K+1), LDC) + ! + ! C1 = -C1*V1 + ! + CALL DTRMM('Right', 'Upper', 'No Transpose', 'Unit', + $ M, K, NEG_ONE, V, LDV, C, LDC) + ELSE IF (QL) THEN + ! We are computing C = HC = (I - VTV')C + ! Where: V = [ V2 ] and C = [ C2 ] + ! [ V1 ] [ C1 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{M-K\times K} + ! C1=0\in\R^{K\times N} + ! C2\in\R^{M-K\times N} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = HC = (I-[ V2 ]T[ V2' V1' ])[ C2 ] + ! [ V1 ] [ 0 ] + ! + ! = [ C2 ] - [ V2 ]T*V2'*C2 + ! [ 0 ] [ V1 ] + ! + ! = [ C2 ] - [ V2*T*V2'*C2 ] + ! [ 0 ] [ V1*T*V2'*C2 ] + ! + ! = [ C2 - V2*T*V2'*C2 ] + ! [ - V1*T*V2'*C2 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = V2'*C2 + ! C1 = T*C1 + ! C2 = C2 - V2*C1 + ! C1 = -V1*C1 + ! + ! To achieve the same end result + ! + ! C1 = V2'*C2 + ! + CALL DGEMM('Transpose', 'No Transpose', K, N, M-K, + $ ONE, V, LDV, C, LDC, ZERO, C(M-K+1, 1), LDC) + ! + ! C1 = T*C1 + ! + CALL DTRMM('Left', 'Lower', 'No Transpose', 'Non-unit', + $ K, N, ONE, T, LDT, C(M-K+1,1), LDC) + ! + ! C2 = C2 - V2*C1 = -V2*C1 + C2 + ! + CALL DGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ONE, C, LDC) + ! + ! C1 = -V1*C1 + ! + CALL DTRMM('Left', 'Upper', 'No Transpose', 'Unit', + $ K, N, NEG_ONE, V(M-K+1,1), LDV, C(M-K+1,1), LDC) + ELSE ! IF (RQ) THEN + ! We are computing C = CH' = C(I-V'T'V) + ! Where: V = [ V2 V1] and C = [ C2 C1 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{K\times N-K} + ! C1=0\in\R^{M\times K} + ! C2\in\R^{M\times N-K} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = CH' = [ C2, 0 ] (I - [ V2' ]T'[ V2, V1 ] + ! [ V1' ] + ! + ! = [ C2, 0 ] - [ C2, 0 ] [ V2' ]T'[ V2, V1 ] + ! [ V1' ] + ! + ! = [ C2, 0 ] - C2*V2'*T'[ V2, V1 ] + ! + ! = [ C2, 0 ] - [ C2*V2'*T'*V2, C2*V2'*T'*V1 ] + ! + ! = [ C2 - C2*V2'*T'*V2, -C2*V2'*T'*V1 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = C2*V2' + ! C1 = C1*T' + ! C2 = C2 - C1*V2 + ! C1 = -C1*V1 + ! + ! + ! To achieve the same end result + ! + ! C1 = C2*V2' + ! + CALL DGEMM('No Transpose', 'Transpose', M, K, N-K, + $ ONE, C, LDC, V, LDV, ZERO, C(1, N-K+1), LDC) + ! + ! C1 = C1*T' + ! + CALL DTRMM('Right', 'Lower', 'Transpose', 'Non-unit', + $ M, K, ONE, T, LDT, C(1, N-K+1), LDC) + ! + ! C2 = C2 - C1*V2 = -C1*V2 + C2 + ! + CALL DGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ONE, C, LDC) + ! + ! C1 = -C1*V1 + ! + CALL DTRMM('Right', 'Lower', 'No Transpose', 'Unit', + $ M, K, NEG_ONE, V(1, N-K+1), LDV, C(1,N-K+1), LDC) + END IF + END SUBROUTINE diff --git a/SRC/dorglq.f b/SRC/dorglq.f index c41367ced4..eeb73e4b27 100644 --- a/SRC/dorglq.f +++ b/SRC/dorglq.f @@ -148,7 +148,7 @@ SUBROUTINE DORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) $ LWKOPT, NB, NBMIN, NX * .. * .. External Subroutines .. - EXTERNAL DLARFB, DLARFT, DORGL2, XERBLA + EXTERNAL DLARFB0C2, DLARFT, DORGL2, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -163,7 +163,8 @@ SUBROUTINE DORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * INFO = 0 NB = ILAENV( 1, 'DORGLQ', ' ', M, N, K, -1 ) - LWKOPT = MAX( 1, M )*NB + ! DLARFB0C2 means we only need a workspace for calls to dorgl2 + LWKOPT = MAX( 1, M ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN @@ -227,11 +228,6 @@ SUBROUTINE DORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * Set A(kk+1:m,1:kk) to zero. * - DO 20 J = 1, KK - DO 10 I = KK + 1, M - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE ELSE KK = 0 END IF @@ -259,26 +255,19 @@ SUBROUTINE DORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * Apply H**T to A(i+ib:m,i:n) from the right * - CALL DLARFB( 'Right', 'Transpose', 'Forward', - $ 'Rowwise', - $ M-I-IB+1, N-I+1, IB, A( I, I ), LDA, WORK, - $ LDWORK, A( I+IB, I ), LDA, WORK( IB+1 ), - $ LDWORK ) + CALL DLARFB0C2('A', 'A', 'Forward', 'Rowwise', + $ M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, LDWORK, + $ A(I+IB,I), LDA) END IF * * Apply H**T to columns i:n of current block -* + CALL DORGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ), $ WORK, $ IINFO ) * * Set columns 1:i-1 of current block to zero * - DO 40 J = 1, I - 1 - DO 30 L = I, I + IB - 1 - A( L, J ) = ZERO - 30 CONTINUE - 40 CONTINUE 50 CONTINUE END IF * diff --git a/SRC/dorgql.f b/SRC/dorgql.f index f931f5a9c8..089dff77bb 100644 --- a/SRC/dorgql.f +++ b/SRC/dorgql.f @@ -149,7 +149,7 @@ SUBROUTINE DORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) $ NB, NBMIN, NX * .. * .. External Subroutines .. - EXTERNAL DLARFB, DLARFT, DORG2L, XERBLA + EXTERNAL DLARFB0C2, DLARFT, DORG2L, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -265,10 +265,9 @@ SUBROUTINE DORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left * - CALL DLARFB( 'Left', 'No transpose', 'Backward', - $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, - $ A( 1, N-K+I ), LDA, WORK, LDWORK, A, LDA, - $ WORK( IB+1 ), LDWORK ) + CALL DLARFB0C2('A', 'A', 'Backward', 'Columnwise', + $ M-K+I+IB-1, N-K+I-1, IB, A(1, N-K+I), LDA, + $ WORK, LDWORK, A, LDA) END IF * * Apply H to rows 1:m-k+i+ib-1 of current block @@ -278,11 +277,6 @@ SUBROUTINE DORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * Set rows m-k+i+ib:m of current block to zero * - DO 40 J = N - K + I, N - K + I + IB - 1 - DO 30 L = M - K + I + IB, M - A( L, J ) = ZERO - 30 CONTINUE - 40 CONTINUE 50 CONTINUE END IF * diff --git a/SRC/dorgqr.f b/SRC/dorgqr.f index fd88519871..5e5851ff44 100644 --- a/SRC/dorgqr.f +++ b/SRC/dorgqr.f @@ -1,3 +1,5 @@ +c +c *> \brief \b DORGQR * * =========== DOCUMENTATION =========== @@ -121,10 +123,11 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup ungqr +*> \ingroup doubleOTHERcomputational * * ===================================================================== SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) + IMPLICIT NONE * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -140,16 +143,16 @@ SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * ===================================================================== * * .. Parameters .. - DOUBLE PRECISION ZERO - PARAMETER ( ZERO = 0.0D+0 ) + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, J, KI, KK, L, LDWORK, + INTEGER I,II, IB, IINFO, IWS, J,JJ, KI, KK, L, LDWORK, $ LWKOPT, NB, NBMIN, NX * .. * .. External Subroutines .. - EXTERNAL DLARFB, DLARFT, DORG2R, XERBLA + EXTERNAL DLARFB0C2, DLARFT, DORG2R, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -164,7 +167,9 @@ SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * INFO = 0 NB = ILAENV( 1, 'DORGQR', ' ', M, N, K, -1 ) - LWKOPT = MAX( 1, N )*NB + ! Only need a workspace for dorg2r in case of bailout and + ! for the panel factorization + LWKOPT = MAX( 1, N ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN @@ -192,96 +197,129 @@ SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) RETURN END IF * + ! Probably not needed anymore NBMIN = 2 + ! Parameter that controls when we cross from blocked to + ! unblocked NX = 0 - IWS = N - IF( NB.GT.1 .AND. NB.LT.K ) THEN * -* Determine when to cross over from blocked to unblocked code. -* - NX = MAX( 0, ILAENV( 3, 'DORGQR', ' ', M, N, K, -1 ) ) - IF( NX.LT.K ) THEN + IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Determine if workspace is large enough for blocked code. +* Use blocked code after the last block. +* The first kk columns are handled by the block method. * - LDWORK = N - IWS = LDWORK*NB - IF( LWORK.LT.IWS ) THEN + KI = K - 2 * NB + KK = K - NB + ELSE + KK = 0 + END IF * -* Not enough workspace to use optimal NB: reduce NB and -* determine the minimum value of NB. +* Use unblocked code for the only block. * - NB = LWORK / LDWORK - NBMIN = MAX( 2, ILAENV( 2, 'DORGQR', ' ', M, N, K, - $ -1 ) ) - END IF - END IF + IF( KK.EQ.0 ) THEN + CALL DORG2R( M, N, K, A, LDA, TAU, WORK, IINFO ) END IF * - IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN + IF( KK.GT.0 ) THEN + I = KK + 1 + IB = NB + ! This is a specialized form of our loop below. We could make this its + ! own function, however this is a specialized step, so currently we + ! don't do that. * -* Use blocked code after the last block. -* The first kk columns are handled by the block method. +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * - KI = ( ( K-NX-1 ) / NB )*NB - KK = MIN( K, KI+NB ) + CALL DLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + $ LDA, TAU(I), A(I,I), LDA) * -* Set A(1:kk,kk+1:n) to zero. +* Apply H to A(i:m,i+ib:n) from the left * - DO 20 J = KK + 1, N - DO 10 I = 1, KK - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE - ELSE - KK = 0 - END IF * -* Use unblocked code for the last or only block. +** W := V2 +* C1 := V2**T * - IF( KK.LT.N ) - $ CALL DORG2R( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA, - $ TAU( KK+1 ), WORK, IINFO ) +* Since C1 starts as 0, we are using this instead of WORK(IB+1). +* This helps us reduce the memory footprint by lowering WORK to +* be of only size IB +* CALL DLACPY('All', N-K, IB, A(I+IB,I), LDA,WORK(IB+1),LDWORK) + DO JJ = K - NB + 1, K + DO II = K + 1, N + A( JJ, II ) = A( II, JJ) + END DO + END DO * - IF( KK.GT.0 ) THEN +* C1 := T * C1 +* + CALL DTRMM( 'Left', 'Upper', 'No transpose', 'Non-unit', IB, + $ N-K,ONE, A(I,I), LDA, A(I,I+IB),LDA ) +* +* C2 := C2 - V2 * C1 +* + CALL DGEMM( 'No transpose', 'No transpose', M-IB-KK, N-K, IB, + $ -ONE, A( I+IB, I ), LDA, A(I,I+IB),LDA, ZERO, + $ A( I+IB, I+IB ), LDA ) + DO JJ = 1, N-K + A(I+IB+JJ-1,I+IB+JJ-1) = 1 + A(I+IB+JJ-1,I+IB+JJ-1) + END DO +* +* C1 := -V1 * C1 +* + CALL DTRMM( 'Left', 'Lower', 'No transpose', 'Unit', IB, N-K, + $ -ONE, A(I,I), LDA, A(I,I+IB),LDA ) * -* Use blocked code +* Apply H to rows i:m of current block * - DO 50 I = KI + 1, 1, -NB - IB = MIN( NB, K-I+1 ) - IF( I+IB.LE.N ) THEN + CALL DORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, IINFO) + DO I = KI + 1, 1, -NB + IB = NB * -* Form the triangular factor of the block reflector -* H = H(i) H(i+1) . . . H(i+ib-1) +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * - CALL DLARFT( 'Forward', 'Columnwise', M-I+1, IB, - $ A( I, I ), LDA, TAU( I ), WORK, LDWORK ) + CALL DLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + $ LDA, TAU(I), A(I,I), LDA) * -* Apply H to A(i:m,i+ib:n) from the left +* Apply H to A(i:m,i+ib:n) from the left * - CALL DLARFB( 'Left', 'No transpose', 'Forward', - $ 'Columnwise', M-I+1, N-I-IB+1, IB, - $ A( I, I ), LDA, WORK, LDWORK, A( I, I+IB ), - $ LDA, WORK( IB+1 ), LDWORK ) - END IF + CALL DLARFB0C2('A', 'A', 'Forward', 'Column', M-I+1, + $ N-(I+IB)+1, IB, A(I,I), LDA, A(I,I), LDA, + $ A(I,I+IB), LDA) + * * Apply H to rows i:m of current block * - CALL DORG2R( M-I+1, IB, IB, A( I, I ), LDA, TAU( I ), - $ WORK, - $ IINFO ) + CALL DORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, + $ IINFO) + END DO +* This checks for if K was a perfect multiple of NB +* so that we only have a special case for the last block when +* necessary + IF(I.LT.1) THEN + IB = I + NB - 1 + I = 1 * -* Set rows 1:i-1 of current block to zero +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * - DO 40 J = I, I + IB - 1 - DO 30 L = 1, I - 1 - A( L, J ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE + CALL DLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + $ LDA, TAU(I), A(I,I), LDA) +* +* Apply H to A(i:m,i+ib:n) from the left +* + CALL DLARFB0C2('A', 'A', 'Forward', 'Column', M-I+1, + $ N-(I+IB)+1, IB, A(I,I), LDA, A(I,I), LDA, + $ A(I,I+IB),LDA) + +* +* Apply H to rows i:m of current block +* + CALL DORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, + $ IINFO) + END IF END IF * - WORK( 1 ) = IWS +* WORK( 1 ) = IWS RETURN * * End of DORGQR diff --git a/SRC/dorgrq.f b/SRC/dorgrq.f index c805484578..114891a7c1 100644 --- a/SRC/dorgrq.f +++ b/SRC/dorgrq.f @@ -149,7 +149,7 @@ SUBROUTINE DORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) $ LWKOPT, NB, NBMIN, NX * .. * .. External Subroutines .. - EXTERNAL DLARFB, DLARFT, DORGR2, XERBLA + EXTERNAL DLARFB0C2, DLARFT, DORGR2, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -266,10 +266,9 @@ SUBROUTINE DORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right * - CALL DLARFB( 'Right', 'Transpose', 'Backward', - $ 'Rowwise', - $ II-1, N-K+I+IB-1, IB, A( II, 1 ), LDA, WORK, - $ LDWORK, A, LDA, WORK( IB+1 ), LDWORK ) + CALL DLARFB0C2( 'A', 'A', 'Backward', 'Rowwise', + $ II-1, N-K+I+IB-1, IB, A(II,1), LDA, WORK, LDWORK, + $ A, LDA) END IF * * Apply H**T to columns 1:n-k+i+ib-1 of current block @@ -277,14 +276,6 @@ SUBROUTINE DORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) CALL DORGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA, $ TAU( I ), $ WORK, IINFO ) -* -* Set columns n-k+i+ib:n of current block to zero -* - DO 40 L = N - K + I + IB, N - DO 30 J = II, II + IB - 1 - A( J, L ) = ZERO - 30 CONTINUE - 40 CONTINUE 50 CONTINUE END IF * diff --git a/SRC/slarfb0c2.f b/SRC/slarfb0c2.f new file mode 100644 index 0000000000..5d235bc011 --- /dev/null +++ b/SRC/slarfb0c2.f @@ -0,0 +1,263 @@ + SUBROUTINE SLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V, + $ LDV, T, LDT, C, LDC) + ! Scalar arguments + INTEGER M, N, K, LDV, LDC, LDT + ! Note: We should probably get rid of SIDE and TRANS as these flags + ! are not used as we are only using this routine inside + ! x{OR,UN}G{QR,LQ,QR,LQ}, which have values that never change + CHARACTER SIDE, TRANS, DIRECT, STOREV + ! Array arguments + REAL V(LDV,*), C(LDC,*), T(LDT,*) + ! Local scalars + LOGICAL QR, LQ, QL, DIRF, COLV + ! External subroutines + EXTERNAL SGEMM, STRMM + ! External functions + LOGICAL LSAME + EXTERNAL LSAME + ! Parameters + REAL ONE, ZERO, NEG_ONE + PARAMETER(ONE=1.0E+0, ZERO = 0.0E+0, NEG_ONE = -1.0E+0) + + ! Beginning of executable statements + ! Convert our character flags to logical values + DIRF = LSAME(DIRECT,'F') + COLV = LSAME(STOREV,'C') + + ! Determine which of the 4 modes are using. + ! QR is when we store the reflectors column by column and have the + ! 'first' reflector stored in the first column + QR = DIRF.AND.COLV + + ! LQ is when we store the reflectors row by row and have the + ! 'first' reflector stored in the first row + LQ = DIRF.AND.(.NOT.COLV) + + ! QL is when we store the reflectors column by column and have the + ! 'first' reflector stored in the last column + QL = (.NOT.DIRF).AND.COLV + + ! RQ is when we store the reflectors row by row and have the + ! 'first' reflector stored in the last row + ! RQ = (.NOT.DIRF).AND.(.NOT.COLV) + ! Since we have exactly one of these 4 modes, we don't need to actually + ! store the value of RQ, instead we assume this is the case if we fail + ! the above 3 checks. + + IF (QR) THEN + ! We are computing C = HC = (I - VTV')C + ! Where: V = [ V1 ] and C = [ C1 ] + ! [ V2 ] [ C2 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{M-K\times K} + ! C1=0\in\R^{K\times N} + ! C2\in\R^{M-K\times N} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = HC = (I - [ V1 ]T [V1', V2'])[ 0 ] + ! [ V2 ] [ C2 ] + ! = [ 0 ] - [ V1 ]T*V2'*C2 + ! [ C2 ] [ V2 ] + ! + ! = [ 0 ] - [ V1*T*V2'*C2 ] + ! [ C2 ] [ V2*T*V2'*C2 ] + ! + ! = [ V1*T*V2'*C2 ] + ! [ C2 - V2*T*V2'*C2 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = V2'*C2 + ! C1 = T*C1 + ! C2 = C2 - V2*C1 + ! C1 = -V1*C1 + ! + ! To achieve the same end result + ! + ! C1 = V2'*C2 + ! + CALL SGEMM('Transpose', 'No Transpose', K, N, M - K, + $ ONE, V(K+1,1), LDV, C(K+1,1), LDC, ZERO, + $ C, LDC) + ! + ! C1 = T*C1 + ! + CALL STRMM('Left', 'Upper', 'No Transpose', 'Non-unit', + $ K, N, ONE, T, LDT, C, LDC) + ! + ! C2 = C2 - V2*C1 = -V2*C1 + C2 + ! + CALL SGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V(K+1,1), LDV, C, LDC, ONE, + $ C(K+1,1), LDC) + ! + ! C1 = -V1*C1 + ! + CALL STRMM('Left', 'Lower', 'No Transpose', 'Unit', + $ K, N, NEG_ONE, V, LDV, C, LDC) + ELSE IF (LQ) THEN + ! We are computing C = CH' = C(I-V'T'V) + ! Where: V = [ V1 V2 ] and C = [ C1 C2 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{K\times N-K} + ! C1=0\in\R^{M\times K} + ! C2\in\R^{M\times N-K} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = CH' = [ 0, C2 ](I - [ V1' ]T'[ V1, V2 ]) + ! [ V2' ] + ! + ! = [ 0, C2 ] - [ 0, C2 ][ V1' ]T'[ V1, V2 ] + ! [ V2' ] + ! + ! = [ 0, C2 ] - C2*V2'*T'[ V1, V2 ] + ! + ! = [ -C2*V2'*T'*V1, C2 - C2*V2'*T'*V2 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = C2*V2' + ! C1 = C1*T' + ! C2 = C2 - C1*V2 + ! C1 = -C1*V1 + ! + ! To achieve the same end result + ! + ! C1 = C2*V2' + ! + CALL SGEMM('No Transpose', 'Transpose', M, K, N-K, + $ ONE, C(1,K+1), LDC, V(1, K+1), LDV, ZERO, C, LDC) + ! + ! C1 = C1*T' + ! + CALL STRMM('Right', 'Upper', 'Transpose', 'Non-unit', + $ M, K, ONE, T, LDT, C, LDC) + ! + ! C2 = C2 - C1*V2 = -C1*V2 + C2 + ! + CALL SGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C, LDC, V(1,K+1), LDV, ONE, + $ C(1,K+1), LDC) + ! + ! C1 = -C1*V1 + ! + CALL STRMM('Right', 'Upper', 'No Transpose', 'Unit', + $ M, K, NEG_ONE, V, LDV, C, LDC) + ELSE IF (QL) THEN + ! We are computing C = HC = (I - VTV')C + ! Where: V = [ V2 ] and C = [ C2 ] + ! [ V1 ] [ C1 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{M-K\times K} + ! C1=0\in\R^{K\times N} + ! C2\in\R^{M-K\times N} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = HC = (I-[ V2 ]T[ V2' V1' ])[ C2 ] + ! [ V1 ] [ 0 ] + ! + ! = [ C2 ] - [ V2 ]T*V2'*C2 + ! [ 0 ] [ V1 ] + ! + ! = [ C2 ] - [ V2*T*V2'*C2 ] + ! [ 0 ] [ V1*T*V2'*C2 ] + ! + ! = [ C2 - V2*T*V2'*C2 ] + ! [ - V1*T*V2'*C2 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = V2'*C2 + ! C1 = T*C1 + ! C2 = C2 - V2*C1 + ! C1 = -V1*C1 + ! + ! To achieve the same end result + ! + ! C1 = V2'*C2 + ! + CALL SGEMM('Transpose', 'No Transpose', K, N, M-K, + $ ONE, V, LDV, C, LDC, ZERO, C(M-K+1, 1), LDC) + ! + ! C1 = T*C1 + ! + CALL STRMM('Left', 'Lower', 'No Transpose', 'Non-unit', + $ K, N, ONE, T, LDT, C(M-K+1,1), LDC) + ! + ! C2 = C2 - V2*C1 = -V2*C1 + C2 + ! + CALL SGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ONE, C, LDC) + ! + ! C1 = -V1*C1 + ! + CALL STRMM('Left', 'Upper', 'No Transpose', 'Unit', + $ K, N, NEG_ONE, V(M-K+1,1), LDV, C(M-K+1,1), LDC) + ELSE ! IF (RQ) THEN + ! We are computing C = CH' = C(I-V'T'V) + ! Where: V = [ V2 V1] and C = [ C2 C1 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{K\times N-K} + ! C1=0\in\R^{M\times K} + ! C2\in\R^{M\times N-K} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = CH' = [ C2, 0 ] (I - [ V2' ]T'[ V2, V1 ] + ! [ V1' ] + ! + ! = [ C2, 0 ] - [ C2, 0 ] [ V2' ]T'[ V2, V1 ] + ! [ V1' ] + ! + ! = [ C2, 0 ] - C2*V2'*T'[ V2, V1 ] + ! + ! = [ C2, 0 ] - [ C2*V2'*T'*V2, C2*V2'*T'*V1 ] + ! + ! = [ C2 - C2*V2'*T'*V2, -C2*V2'*T'*V1 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = C2*V2' + ! C1 = C1*T' + ! C2 = C2 - C1*V2 + ! C1 = -C1*V1 + ! + ! + ! To achieve the same end result + ! + ! C1 = C2*V2' + ! + CALL SGEMM('No Transpose', 'Transpose', M, K, N-K, + $ ONE, C, LDC, V, LDV, ZERO, C(1, N-K+1), LDC) + ! + ! C1 = C1*T' + ! + CALL STRMM('Right', 'Lower', 'Transpose', 'Non-unit', + $ M, K, ONE, T, LDT, C(1, N-K+1), LDC) + ! + ! C2 = C2 - C1*V2 = -C1*V2 + C2 + ! + CALL SGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ONE, C, LDC) + ! + ! C1 = -C1*V1 + ! + CALL STRMM('Right', 'Lower', 'No Transpose', 'Unit', + $ M, K, NEG_ONE, V(1, N-K+1), LDV, C(1,N-K+1), LDC) + END IF + END SUBROUTINE From 09a04d7931557c921f906d1784844fa8ddc30738 Mon Sep 17 00:00:00 2001 From: Johnathan Rhyne Date: Sun, 9 Feb 2025 10:51:51 -0700 Subject: [PATCH 02/10] Real version of xorgqr using new larfb implemented and passing tests locally --- SRC/CMakeLists.txt | 2 +- SRC/Makefile | 2 +- SRC/dlarfb0c2.f | 182 ++++++++++++++++++++++++++++++++++++--------- SRC/dorglq.f | 96 ++++++++++++++++-------- SRC/dorgql.f | 103 +++++++++++++------------ SRC/dorgqr.f | 93 ++++++++--------------- SRC/dorgrq.f | 76 +++++++++++-------- SRC/slarfb0c2.f | 182 ++++++++++++++++++++++++++++++++++++--------- SRC/sorglq.f | 107 +++++++++++++++----------- SRC/sorgql.f | 108 ++++++++++++--------------- SRC/sorgqr.f | 126 +++++++++++++++---------------- SRC/sorgrq.f | 85 +++++++++++---------- 12 files changed, 713 insertions(+), 449 deletions(-) diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt index 88e503efea..12cfebca5f 100644 --- a/SRC/CMakeLists.txt +++ b/SRC/CMakeLists.txt @@ -106,7 +106,7 @@ set(SLASRC slaqgb.f slaqge.f slaqp2.f slaqps.f slaqp2rk.f slaqp3rk.f slaqsb.f slaqsp.f slaqsy.f slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f - slarf.f slarf1f.f slarf1l.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f + slarf.f slarf1f.f slarf1l.f slarfb.f slarfb0c2.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f slarmm.f slarrv.f slartv.f slarz.f slarzb.f slarzt.f slasy2.f slasyf.f slasyf_rook.f slasyf_rk.f slasyf_aa.f diff --git a/SRC/Makefile b/SRC/Makefile index dc8f472092..5fb5db20a6 100644 --- a/SRC/Makefile +++ b/SRC/Makefile @@ -137,7 +137,7 @@ SLASRC = \ slaqgb.o slaqge.o slaqp2.o slaqps.o slaqp2rk.o slaqp3rk.o slaqsb.o slaqsp.o slaqsy.o \ slaqr0.o slaqr1.o slaqr2.o slaqr3.o slaqr4.o slaqr5.o \ slaqtr.o slar1v.o slar2v.o ilaslr.o ilaslc.o \ - slarf.o slarf1f.o slarf1l.o slarfb.o slarfb_gett.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o \ + slarf.o slarf1f.o slarf1l.o slarfb.o slarfb0c2.o slarfb_gett.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o \ slargv.o slarmm.o slarrv.o slartv.o \ slarz.o slarzb.o slarzt.o slaswp.o slasy2.o slasyf.o slasyf_rook.o \ slasyf_rk.o \ diff --git a/SRC/dlarfb0c2.f b/SRC/dlarfb0c2.f index cb7f9d3873..4651cdbea4 100644 --- a/SRC/dlarfb0c2.f +++ b/SRC/dlarfb0c2.f @@ -1,20 +1,24 @@ - SUBROUTINE DLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V, - $ LDV, T, LDT, C, LDC) + SUBROUTINE DLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, + $ K, V, LDV, T, LDT, C, LDC) ! Scalar arguments INTEGER M, N, K, LDV, LDC, LDT - ! Note: We should probably get rid of SIDE and TRANS as these flags - ! are not used as we are only using this routine inside - ! x{OR,UN}G{QR,LQ,QR,LQ}, which have values that never change CHARACTER SIDE, TRANS, DIRECT, STOREV + ! True means that we are assuming C2 is the identity matrix + ! and thus don't reference whatever is present in C2 + ! at the beginning. + LOGICAL C2I + ! Array arguments DOUBLE PRECISION V(LDV,*), C(LDC,*), T(LDT,*) ! Local scalars - LOGICAL QR, LQ, QL, DIRF, COLV - ! External subroutines - EXTERNAL DGEMM, DTRMM + LOGICAL QR, LQ, QL, DIRF, COLV, SIDEL, SIDER, + $ TRANST + INTEGER I, J ! External functions LOGICAL LSAME EXTERNAL LSAME + ! External subroutines + EXTERNAL DGEMM, DTRMM, XERBLA ! Parameters DOUBLE PRECISION ONE, ZERO, NEG_ONE PARAMETER(ONE=1.0D+0, ZERO = 0.0D+0, NEG_ONE = -1.0D+0) @@ -23,6 +27,9 @@ SUBROUTINE DLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V, ! Convert our character flags to logical values DIRF = LSAME(DIRECT,'F') COLV = LSAME(STOREV,'C') + SIDEL = LSAME(SIDE,'L') + SIDER = LSAME(SIDE,'R') + TRANST = LSAME(TRANS,'T') ! Determine which of the 4 modes are using. ! QR is when we store the reflectors column by column and have the @@ -78,26 +85,53 @@ SUBROUTINE DLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V, ! ! To achieve the same end result ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDEL ) THEN + CALL XERBLA('DLARFB0C2', 2) + RETURN + ELSE IF(TRANST) THEN + CALL XERBLA('DLARFB0C2', 3) + RETURN + END IF + ! ! C1 = V2'*C2 ! - CALL DGEMM('Transpose', 'No Transpose', K, N, M - K, - $ ONE, V(K+1,1), LDV, C(K+1,1), LDC, ZERO, - $ C, LDC) + IF (C2I) THEN + DO J = 1, N + DO I = 1, K + C(I,J) = V(K+J,I) + END DO + END DO + ELSE + CALL DGEMM('Transpose', 'No Transpose', K, N, M - K, + $ ONE, V(K+1,1), LDV, C(K+1,1), LDC, ZERO, + $ C, LDC) + END IF ! ! C1 = T*C1 ! - CALL DTRMM('Left', 'Upper', 'No Transpose', 'Non-unit', + CALL DTRMM('Left', 'Upper', 'No Transpose', 'Non-unit', $ K, N, ONE, T, LDT, C, LDC) ! ! C2 = C2 - V2*C1 = -V2*C1 + C2 ! - CALL DGEMM('No Transpose', 'No Transpose', M-K, N, K, - $ NEG_ONE, V(K+1,1), LDV, C, LDC, ONE, - $ C(K+1,1), LDC) + IF (C2I) THEN + CALL DGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V(K+1,1), LDV, C, LDC, ZERO, + $ C(K+1,1), LDC) + DO I = 1, N + C(K+I,I) = C(K+I,I) + ONE + END DO + ELSE + CALL DGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V(K+1,1), LDV, C, LDC, ONE, + $ C(K+1,1), LDC) + END IF ! ! C1 = -V1*C1 ! - CALL DTRMM('Left', 'Lower', 'No Transpose', 'Unit', + CALL DTRMM('Left', 'Lower', 'No Transpose', 'Unit', $ K, N, NEG_ONE, V, LDV, C, LDC) ELSE IF (LQ) THEN ! We are computing C = CH' = C(I-V'T'V) @@ -131,10 +165,29 @@ SUBROUTINE DLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V, ! ! To achieve the same end result ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDER ) THEN + CALL XERBLA('DLARFB0C2', 2) + RETURN + ELSE IF(.NOT.TRANST) THEN + CALL XERBLA('DLARFB0C2', 3) + RETURN + END IF + ! ! C1 = C2*V2' ! - CALL DGEMM('No Transpose', 'Transpose', M, K, N-K, - $ ONE, C(1,K+1), LDC, V(1, K+1), LDV, ZERO, C, LDC) + IF( C2I ) THEN + DO J = 1, K + DO I = 1, M + C(I,J) = V(J,K+I) + END DO + END DO + ELSE + CALL DGEMM('No Transpose', 'Transpose', M, K, N-K, + $ ONE, C(1,K+1), LDC, V(1, K+1), LDV, ZERO, C, + $ LDC) + END IF ! ! C1 = C1*T' ! @@ -143,9 +196,18 @@ SUBROUTINE DLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V, ! ! C2 = C2 - C1*V2 = -C1*V2 + C2 ! - CALL DGEMM('No Transpose', 'No Transpose', M, N-K, K, - $ NEG_ONE, C, LDC, V(1,K+1), LDV, ONE, - $ C(1,K+1), LDC) + IF( C2I ) THEN + CALL DGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C, LDC, V(1,K+1), LDV, ZERO, C(1,K+1), + $ LDC) + DO I = 1, M + C(I,K+I) = C(I,K+I) + ONE + END DO + ELSE + CALL DGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C, LDC, V(1,K+1), LDV, ONE, C(1,K+1), + $ LDC) + END IF ! ! C1 = -C1*V1 ! @@ -186,10 +248,28 @@ SUBROUTINE DLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V, ! ! To achieve the same end result ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDEL ) THEN + CALL XERBLA('DLARFB0C2', 2) + RETURN + ELSE IF(TRANST) THEN + CALL XERBLA('DLARFB0C2', 3) + RETURN + END IF + ! ! C1 = V2'*C2 ! - CALL DGEMM('Transpose', 'No Transpose', K, N, M-K, - $ ONE, V, LDV, C, LDC, ZERO, C(M-K+1, 1), LDC) + IF( C2I ) THEN + DO J = 1, N + DO I = 1, K + C(M-K+I,J) = V(J,I) + END DO + END DO + ELSE + CALL DGEMM('Transpose', 'No Transpose', K, N, M-K, + $ ONE, V, LDV, C, LDC, ZERO, C(M-K+1, 1), LDC) + END IF ! ! C1 = T*C1 ! @@ -198,12 +278,20 @@ SUBROUTINE DLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V, ! ! C2 = C2 - V2*C1 = -V2*C1 + C2 ! - CALL DGEMM('No Transpose', 'No Transpose', M-K, N, K, - $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ONE, C, LDC) + IF( C2I ) THEN + CALL DGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ZERO, C, LDC) + DO I = 1, N + C(I,I) = C(I,I) + ONE + END DO + ELSE + CALL DGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ONE, C, LDC) + END IF ! ! C1 = -V1*C1 ! - CALL DTRMM('Left', 'Upper', 'No Transpose', 'Unit', + CALL DTRMM('Left', 'Upper', 'No Transpose', 'Unit', $ K, N, NEG_ONE, V(M-K+1,1), LDV, C(M-K+1,1), LDC) ELSE ! IF (RQ) THEN ! We are computing C = CH' = C(I-V'T'V) @@ -226,7 +314,7 @@ SUBROUTINE DLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V, ! ! = [ C2, 0 ] - C2*V2'*T'[ V2, V1 ] ! - ! = [ C2, 0 ] - [ C2*V2'*T'*V2, C2*V2'*T'*V1 ] + ! = [ C2, 0 ] - [ C2*V2'*T'*V2, C2*V2'*T'*V1 ] ! ! = [ C2 - C2*V2'*T'*V2, -C2*V2'*T'*V1 ] ! @@ -236,28 +324,54 @@ SUBROUTINE DLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V, ! C1 = C1*T' ! C2 = C2 - C1*V2 ! C1 = -C1*V1 - ! + ! ! ! To achieve the same end result ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDER ) THEN + CALL XERBLA('DLARFB0C2', 2) + RETURN + ELSE IF(.NOT.TRANST) THEN + CALL XERBLA('DLARFB0C2', 3) + RETURN + END IF + ! ! C1 = C2*V2' ! - CALL DGEMM('No Transpose', 'Transpose', M, K, N-K, - $ ONE, C, LDC, V, LDV, ZERO, C(1, N-K+1), LDC) + IF( C2I ) THEN + DO J = 1, K + DO I = 1, M + C(I,N-K+J) = V(J,I) + END DO + END DO + ELSE + CALL DGEMM('No Transpose', 'Transpose', M, K, N-K, + $ ONE, C, LDC, V, LDV, ZERO, C(1, N-K+1), LDC) + END IF ! ! C1 = C1*T' ! - CALL DTRMM('Right', 'Lower', 'Transpose', 'Non-unit', + CALL DTRMM('Right', 'Lower', 'Transpose', 'Non-unit', $ M, K, ONE, T, LDT, C(1, N-K+1), LDC) ! ! C2 = C2 - C1*V2 = -C1*V2 + C2 ! - CALL DGEMM('No Transpose', 'No Transpose', M, N-K, K, - $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ONE, C, LDC) + IF( C2I ) THEN + CALL DGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ZERO, C, LDC) + DO I = 1, M + C(I,I) = C(I,I) + ONE + END DO + ELSE + CALL DGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ONE, C, LDC) + END IF ! ! C1 = -C1*V1 ! - CALL DTRMM('Right', 'Lower', 'No Transpose', 'Unit', + CALL DTRMM('Right', 'Lower', 'No Transpose', 'Unit', $ M, K, NEG_ONE, V(1, N-K+1), LDV, C(1,N-K+1), LDC) END IF END SUBROUTINE diff --git a/SRC/dorglq.f b/SRC/dorglq.f index b686cc468a..6507f625b3 100644 --- a/SRC/dorglq.f +++ b/SRC/dorglq.f @@ -5,6 +5,7 @@ * Online html documentation available at * http://www.netlib.org/lapack/explore-html/ * +*> \htmlonly *> Download DORGLQ + dependencies *> *> [TGZ] @@ -12,6 +13,7 @@ *> [ZIP] *> *> [TXT] +*> \endhtmlonly * * Definition: * =========== @@ -161,8 +163,7 @@ SUBROUTINE DORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * INFO = 0 NB = ILAENV( 1, 'DORGLQ', ' ', M, N, K, -1 ) - ! DLARFB0C2 means we only need a workspace for calls to dorgl2 - LWKOPT = MAX( 1, M ) + LWKOPT = MAX( 1, M ) * NB WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN @@ -218,55 +219,90 @@ SUBROUTINE DORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Use blocked code after the last block. -* The first kk rows are handled by the block method. -* - KI = ( ( K-NX-1 ) / NB )*NB - KK = MIN( K, KI+NB ) -* -* Set A(kk+1:m,1:kk) to zero. +* Handle the first block assuming we are applying to the +* identity, then resume regular blocking method after * + KI = K - 2 * NB + KK = K - NB ELSE KK = 0 END IF * -* Use unblocked code for the last or only block. +* Potentially bail to the unblocked version * - IF( KK.LT.M ) - $ CALL DORGL2( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA, - $ TAU( KK+1 ), WORK, IINFO ) + IF( KK.EQ.0 ) THEN + CALL DORGL2( M, N, K, A, LDA, TAU, WORK, IINFO ) + END IF * IF( KK.GT.0 ) THEN + I = KK + 1 + IB = NB +* +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) +* + CALL DLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ), + $ LDA, TAU( I ), WORK, LDWORK ) +* +* Apply H**T to A(i+ib:m,i:n) from the right +* + CALL DLARFB0C2(.TRUE., 'Right', 'Transpose', 'Forward', + $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, + $ LDWORK, A(I+IB,I), LDA) +* +* Apply H**T to columns i:n of current block + + CALL DORGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ), + $ WORK, IINFO ) * * Use blocked code * - DO 50 I = KI + 1, 1, -NB - IB = MIN( NB, K-I+1 ) - IF( I+IB.LE.M ) THEN + DO I = KI + 1, 1, -NB + IB = NB * -* Form the triangular factor of the block reflector -* H = H(i) H(i+1) . . . H(i+ib-1) +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * - CALL DLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, - $ I ), - $ LDA, TAU( I ), WORK, LDWORK ) + CALL DLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ), + $ LDA, TAU( I ), WORK, LDWORK ) * -* Apply H**T to A(i+ib:m,i:n) from the right +* Apply H**T to A(i+ib:m,i:n) from the right * - CALL DLARFB0C2('A', 'A', 'Forward', 'Rowwise', - $ M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, LDWORK, - $ A(I+IB,I), LDA) - END IF + CALL DLARFB0C2(.FALSE., 'Right', 'Transpose', 'Forward', + $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, + $ LDWORK, A(I+IB,I), LDA) * * Apply H**T to columns i:n of current block CALL DORGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ), - $ WORK, - $ IINFO ) + $ WORK, IINFO ) + END DO * -* Set columns 1:i-1 of current block to zero +* This checks for if K was a perfect multiple of NB +* so that we only have a special case for the last block when +* necessary * - 50 CONTINUE + IF(I.LT.1) THEN + IB = I + NB - 1 + I = 1 +* +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) +* + CALL DLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ), + $ LDA, TAU( I ), WORK, LDWORK ) +* +* Apply H**T to A(i+ib:m,i:n) from the right +* + CALL DLARFB0C2(.FALSE., 'Right', 'Transpose', 'Forward', + $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, + $ LDWORK, A(I+IB,I), LDA) +* +* Apply H**T to columns i:n of current block + + CALL DORGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ), + $ WORK, IINFO ) + END IF END IF * WORK( 1 ) = IWS diff --git a/SRC/dorgql.f b/SRC/dorgql.f index 892c91b855..9f34c858f9 100644 --- a/SRC/dorgql.f +++ b/SRC/dorgql.f @@ -5,6 +5,7 @@ * Online html documentation available at * http://www.netlib.org/lapack/explore-html/ * +*> \htmlonly *> Download DORGQL + dependencies *> *> [TGZ] @@ -12,6 +13,7 @@ *> [ZIP] *> *> [TXT] +*> \endhtmlonly * * Definition: * =========== @@ -143,8 +145,7 @@ SUBROUTINE DORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, J, KK, L, LDWORK, LWKOPT, - $ NB, NBMIN, NX + INTEGER I, IB, IINFO, IWS, J, KK, LWKOPT, NB, NBMIN * .. * .. External Subroutines .. EXTERNAL DLARFB0C2, DLARFT, DORG2L, XERBLA @@ -177,7 +178,8 @@ SUBROUTINE DORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) LWKOPT = 1 ELSE NB = ILAENV( 1, 'DORGQL', ' ', M, N, K, -1 ) - LWKOPT = N*NB + ! Only need a workspace for calls to dorg2l + LWKOPT = N END IF WORK( 1 ) = LWKOPT * @@ -200,82 +202,77 @@ SUBROUTINE DORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) END IF * NBMIN = 2 - NX = 0 IWS = N - IF( NB.GT.1 .AND. NB.LT.K ) THEN * -* Determine when to cross over from blocked to unblocked code. + IF( NB.GE.NBMIN .AND. NB.LT.K ) THEN * - NX = MAX( 0, ILAENV( 3, 'DORGQL', ' ', M, N, K, -1 ) ) - IF( NX.LT.K ) THEN +* We want to use the blocking method as long as our matrix is big enough * -* Determine if workspace is large enough for blocked code. -* - LDWORK = N - IWS = LDWORK*NB - IF( LWORK.LT.IWS ) THEN + KK = K + ELSE + KK = 0 + END IF * -* Not enough workspace to use optimal NB: reduce NB and -* determine the minimum value of NB. +* Possibly bail to the unblocked code. * - NB = LWORK / LDWORK - NBMIN = MAX( 2, ILAENV( 2, 'DORGQL', ' ', M, N, K, - $ -1 ) ) - END IF - END IF + IF( KK.EQ.0 ) THEN + CALL DORG2L( M, N, K, A, LDA, TAU, WORK, IINFO ) END IF * - IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN +* Use our blocked code for everything * -* Use blocked code after the first block. -* The last kk columns are handled by the block method. + IF( KK.GT.0 ) THEN * - KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB ) +* Factor the first block assuming that our first application +* will be on the Identity matrix * -* Set A(m-kk+1:m,1:n-kk) to zero. + I = 1 + IB = NB * - DO 20 J = 1, N - KK - DO 10 I = M - KK + 1, M - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE - ELSE - KK = 0 - END IF +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) * -* Use unblocked code for the first or only block. + CALL DLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB, + $ A( 1, N-K+I ), LDA, TAU( I ), + $ A( M-K+I, N-K+I ), LDA) * - CALL DORG2L( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO ) +* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left * - IF( KK.GT.0 ) THEN + CALL DLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Backward', + $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, A(1, N-K+I), + $ LDA, A( M-K+I, N-K+I ), LDA, A, LDA) +* +* Apply H to rows 1:m-k+i+ib-1 of current block +* + CALL DORG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA, + $ TAU( I ), WORK, IINFO ) + +* Use blocked code on the remaining blocks if there are any. * -* Use blocked code + DO I = NB+1, K, NB * - DO 50 I = K - KK + 1, K, NB - IB = MIN( NB, K-I+1 ) - IF( N-K+I.GT.1 ) THEN +* The last block may be less than size NB * -* Form the triangular factor of the block reflector -* H = H(i+ib-1) . . . H(i+1) H(i) + IB = MIN(NB, K-I+1) * - CALL DLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB, - $ A( 1, N-K+I ), LDA, TAU( I ), WORK, LDWORK ) +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) * -* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left + CALL DLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB, + $ A( 1, N-K+I ), LDA, TAU( I ), + $ A( M-K+I, N-K+I ), LDA ) * - CALL DLARFB0C2('A', 'A', 'Backward', 'Columnwise', - $ M-K+I+IB-1, N-K+I-1, IB, A(1, N-K+I), LDA, - $ WORK, LDWORK, A, LDA) - END IF +* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left +* + CALL DLARFB0C2(.FALSE., 'Left', 'No Transpose', + $ 'Backward', 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, + $ A(1, N-K+I), LDA, A( M-K+I, N-K+I ), LDA, A, LDA) * * Apply H to rows 1:m-k+i+ib-1 of current block * CALL DORG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA, $ TAU( I ), WORK, IINFO ) -* -* Set rows m-k+i+ib:m of current block to zero -* - 50 CONTINUE + END DO END IF * WORK( 1 ) = IWS diff --git a/SRC/dorgqr.f b/SRC/dorgqr.f index 5fae768438..0ed278ffc0 100644 --- a/SRC/dorgqr.f +++ b/SRC/dorgqr.f @@ -1,5 +1,3 @@ -c -c *> \brief \b DORGQR * * =========== DOCUMENTATION =========== @@ -7,6 +5,7 @@ * Online html documentation available at * http://www.netlib.org/lapack/explore-html/ * +*> \htmlonly *> Download DORGQR + dependencies *> *> [TGZ] @@ -14,6 +13,7 @@ *> [ZIP] *> *> [TXT] +*> \endhtmlonly * * Definition: * =========== @@ -125,7 +125,6 @@ * * ===================================================================== SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) - IMPLICIT NONE * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -141,13 +140,13 @@ SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * ===================================================================== * * .. Parameters .. - DOUBLE PRECISION ZERO, ONE - PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I,II, IB, IINFO, IWS, J,JJ, KI, KK, L, LDWORK, - $ LWKOPT, NB, NBMIN, NX + INTEGER I, IB, IINFO, IWS, J, KI, KK, LWKOPT, + $ NB, NBMIN, NX * .. * .. External Subroutines .. EXTERNAL DLARFB0C2, DLARFT, DORG2R, XERBLA @@ -165,7 +164,7 @@ SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * INFO = 0 NB = ILAENV( 1, 'DORGQR', ' ', M, N, K, -1 ) - ! Only need a workspace for dorg2r in case of bailout and + ! Only need a workspace for dorg2r in case of bailout and ! for the panel factorization LWKOPT = MAX( 1, N ) WORK( 1 ) = LWKOPT @@ -195,16 +194,14 @@ SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) RETURN END IF * - ! Probably not needed anymore NBMIN = 2 - ! Parameter that controls when we cross from blocked to - ! unblocked - NX = 0 + NX = MAX(0, ILAENV(3, 'SORGQR', ' ', M, N, K, -1)) + IWS = N * IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Use blocked code after the last block. -* The first kk columns are handled by the block method. +* Handle the first block assuming we are applying to the +* identity, then resume regular blocking method after * KI = K - 2 * NB KK = K - NB @@ -212,59 +209,27 @@ SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) KK = 0 END IF * -* Use unblocked code for the only block. +* Potentially bail to the unblocked code. * IF( KK.EQ.0 ) THEN - CALL DORG2R( M, N, K, A, LDA, TAU, WORK, IINFO ) + CALL DORG2R( M, N, K, A, LDA, TAU, WORK, IINFO ) END IF * IF( KK.GT.0 ) THEN I = KK + 1 IB = NB - ! This is a specialized form of our loop below. We could make this its - ! own function, however this is a specialized step, so currently we - ! don't do that. * * Form the triangular factor of the block reflector * H = H(i) H(i+1) . . . H(i+ib-1) * - CALL DLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + CALL DLARFT('Forward', 'Column', M-I+1, IB, A(I,I), $ LDA, TAU(I), A(I,I), LDA) * * Apply H to A(i:m,i+ib:n) from the left * -* -** W := V2 -* C1 := V2**T -* -* Since C1 starts as 0, we are using this instead of WORK(IB+1). -* This helps us reduce the memory footprint by lowering WORK to -* be of only size IB -* CALL DLACPY('All', N-K, IB, A(I+IB,I), LDA,WORK(IB+1),LDWORK) - DO JJ = K - NB + 1, K - DO II = K + 1, N - A( JJ, II ) = A( II, JJ) - END DO - END DO -* -* C1 := T * C1 -* - CALL DTRMM( 'Left', 'Upper', 'No transpose', 'Non-unit', IB, - $ N-K,ONE, A(I,I), LDA, A(I,I+IB),LDA ) -* -* C2 := C2 - V2 * C1 -* - CALL DGEMM( 'No transpose', 'No transpose', M-IB-KK, N-K, IB, - $ -ONE, A( I+IB, I ), LDA, A(I,I+IB),LDA, ZERO, - $ A( I+IB, I+IB ), LDA ) - DO JJ = 1, N-K - A(I+IB+JJ-1,I+IB+JJ-1) = 1 + A(I+IB+JJ-1,I+IB+JJ-1) - END DO -* -* C1 := -V1 * C1 -* - CALL DTRMM( 'Left', 'Lower', 'No transpose', 'Unit', IB, N-K, - $ -ONE, A(I,I), LDA, A(I,I+IB),LDA ) + CALL DLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Forward', + $ 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), LDA, A(I,I), + $ LDA, A(I,I+IB), LDA) * * Apply H to rows i:m of current block * @@ -275,24 +240,26 @@ SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * Form the triangular factor of the block reflector * H = H(i) H(i+1) . . . H(i+ib-1) * - CALL DLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + CALL DLARFT('Forward', 'Column', M-I+1, IB, A(I,I), $ LDA, TAU(I), A(I,I), LDA) * * Apply H to A(i:m,i+ib:n) from the left * - CALL DLARFB0C2('A', 'A', 'Forward', 'Column', M-I+1, - $ N-(I+IB)+1, IB, A(I,I), LDA, A(I,I), LDA, - $ A(I,I+IB), LDA) + CALL DLARFB0C2(.FALSE., 'Left', 'No Transpose', + $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), + $ LDA, A(I,I), LDA, A(I,I+IB), LDA) * * Apply H to rows i:m of current block * - CALL DORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, + CALL DORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, $ IINFO) END DO +* * This checks for if K was a perfect multiple of NB * so that we only have a special case for the last block when * necessary +* IF(I.LT.1) THEN IB = I + NB - 1 I = 1 @@ -300,24 +267,24 @@ SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * Form the triangular factor of the block reflector * H = H(i) H(i+1) . . . H(i+ib-1) * - CALL DLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + CALL DLARFT('Forward', 'Column', M-I+1, IB, A(I,I), $ LDA, TAU(I), A(I,I), LDA) * * Apply H to A(i:m,i+ib:n) from the left * - CALL DLARFB0C2('A', 'A', 'Forward', 'Column', M-I+1, - $ N-(I+IB)+1, IB, A(I,I), LDA, A(I,I), LDA, - $ A(I,I+IB),LDA) + CALL DLARFB0C2(.FALSE., 'Left', 'No Transpose', + $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), + $ LDA, A(I,I), LDA, A(I,I+IB), LDA) * * Apply H to rows i:m of current block * - CALL DORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, + CALL DORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, $ IINFO) END IF END IF * -* WORK( 1 ) = IWS + WORK( 1 ) = IWS RETURN * * End of DORGQR diff --git a/SRC/dorgrq.f b/SRC/dorgrq.f index ccaad3b433..ce61238776 100644 --- a/SRC/dorgrq.f +++ b/SRC/dorgrq.f @@ -5,6 +5,7 @@ * Online html documentation available at * http://www.netlib.org/lapack/explore-html/ * +*> \htmlonly *> Download DORGRQ + dependencies *> *> [TGZ] @@ -12,6 +13,7 @@ *> [ZIP] *> *> [TXT] +*> \endhtmlonly * * Definition: * =========== @@ -227,54 +229,70 @@ SUBROUTINE DORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Use blocked code after the first block. -* The last kk rows are handled by the block method. +* We want to use the blocking method as long as our matrix is big enough +* and it's deemed worthwhile with the extra memory allocations * - KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB ) -* -* Set A(1:m-kk,n-kk+1:n) to zero. -* - DO 20 J = N - KK + 1, N - DO 10 I = 1, M - KK - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE + KK = K ELSE KK = 0 END IF * -* Use unblocked code for the first or only block. +* Potentially bail to the unblocked code * - CALL DORGR2( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO ) + IF( KK.EQ.0 ) THEN + CALL DORGR2( M, N, K, A, LDA, TAU, WORK, IINFO ) + END IF * IF( KK.GT.0 ) THEN * -* Use blocked code +* Factor the first block assuming that our first application +* will be on the Identity matrix +* + I = 1 + IB = NB + II = M - K + I +* +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) +* + CALL DLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB, + $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK ) +* +* Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right * - DO 50 I = K - KK + 1, K, NB - IB = MIN( NB, K-I+1 ) + CALL DLARFB0C2(.TRUE., 'Right', 'Transpose', 'Backward', + $ 'Rowwise', II-1, N-K+I+IB-1, IB, A(II,1), LDA, WORK, + $ LDWORK, A, LDA) +* +* Apply H**T to columns 1:n-k+i+ib-1 of current block +* + CALL DORGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA, + $ TAU( I ), WORK, IINFO ) + + DO I = NB + 1, K, NB +* +* The last block may be less than size NB +* + IB = MIN(NB, K-I+1) II = M - K + I - IF( II.GT.1 ) THEN * -* Form the triangular factor of the block reflector -* H = H(i+ib-1) . . . H(i+1) H(i) +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) * - CALL DLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB, - $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK ) + CALL DLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB, + $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK ) * -* Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right +* Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right * - CALL DLARFB0C2( 'A', 'A', 'Backward', 'Rowwise', - $ II-1, N-K+I+IB-1, IB, A(II,1), LDA, WORK, LDWORK, - $ A, LDA) - END IF + CALL DLARFB0C2(.FALSE., 'Right', 'Transpose', + $ 'Backward', 'Rowwise', II-1, N-K+I+IB-1, IB, + $ A(II,1), LDA, WORK, LDWORK, A, LDA) * * Apply H**T to columns 1:n-k+i+ib-1 of current block * CALL DORGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA, - $ TAU( I ), - $ WORK, IINFO ) - 50 CONTINUE + $ TAU( I ), WORK, IINFO ) + END DO END IF * WORK( 1 ) = IWS diff --git a/SRC/slarfb0c2.f b/SRC/slarfb0c2.f index 5d235bc011..74a31f6f76 100644 --- a/SRC/slarfb0c2.f +++ b/SRC/slarfb0c2.f @@ -1,20 +1,24 @@ - SUBROUTINE SLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V, - $ LDV, T, LDT, C, LDC) + SUBROUTINE SLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, + $ K, V, LDV, T, LDT, C, LDC) ! Scalar arguments INTEGER M, N, K, LDV, LDC, LDT - ! Note: We should probably get rid of SIDE and TRANS as these flags - ! are not used as we are only using this routine inside - ! x{OR,UN}G{QR,LQ,QR,LQ}, which have values that never change CHARACTER SIDE, TRANS, DIRECT, STOREV + ! True means that we are assuming C2 is the identity matrix + ! and thus don't reference whatever is present in C2 + ! at the beginning. + LOGICAL C2I + ! Array arguments REAL V(LDV,*), C(LDC,*), T(LDT,*) ! Local scalars - LOGICAL QR, LQ, QL, DIRF, COLV - ! External subroutines - EXTERNAL SGEMM, STRMM + LOGICAL QR, LQ, QL, DIRF, COLV, SIDEL, SIDER, + $ TRANST + INTEGER I, J ! External functions LOGICAL LSAME EXTERNAL LSAME + ! External Subroutines + EXTERNAL SGEMM, STRMM, XERBLA ! Parameters REAL ONE, ZERO, NEG_ONE PARAMETER(ONE=1.0E+0, ZERO = 0.0E+0, NEG_ONE = -1.0E+0) @@ -23,6 +27,9 @@ SUBROUTINE SLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V, ! Convert our character flags to logical values DIRF = LSAME(DIRECT,'F') COLV = LSAME(STOREV,'C') + SIDEL = LSAME(SIDE,'L') + SIDER = LSAME(SIDE,'R') + TRANST = LSAME(TRANS,'T') ! Determine which of the 4 modes are using. ! QR is when we store the reflectors column by column and have the @@ -78,26 +85,53 @@ SUBROUTINE SLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V, ! ! To achieve the same end result ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDEL ) THEN + CALL XERBLA('DLARFB0C2', 2) + RETURN + ELSE IF(TRANST) THEN + CALL XERBLA('DLARFB0C2', 3) + RETURN + END IF + ! ! C1 = V2'*C2 ! - CALL SGEMM('Transpose', 'No Transpose', K, N, M - K, - $ ONE, V(K+1,1), LDV, C(K+1,1), LDC, ZERO, - $ C, LDC) + IF (C2I) THEN + DO J = 1, N + DO I = 1, K + C(I,J) = V(K+J,I) + END DO + END DO + ELSE + CALL SGEMM('Transpose', 'No Transpose', K, N, M - K, + $ ONE, V(K+1,1), LDV, C(K+1,1), LDC, ZERO, + $ C, LDC) + END IF ! ! C1 = T*C1 ! - CALL STRMM('Left', 'Upper', 'No Transpose', 'Non-unit', + CALL STRMM('Left', 'Upper', 'No Transpose', 'Non-unit', $ K, N, ONE, T, LDT, C, LDC) ! ! C2 = C2 - V2*C1 = -V2*C1 + C2 ! - CALL SGEMM('No Transpose', 'No Transpose', M-K, N, K, - $ NEG_ONE, V(K+1,1), LDV, C, LDC, ONE, - $ C(K+1,1), LDC) + IF (C2I) THEN + CALL SGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V(K+1,1), LDV, C, LDC, ZERO, + $ C(K+1,1), LDC) + DO I = 1, N + C(K+I,I) = C(K+I,I) + ONE + END DO + ELSE + CALL SGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V(K+1,1), LDV, C, LDC, ONE, + $ C(K+1,1), LDC) + END IF ! ! C1 = -V1*C1 ! - CALL STRMM('Left', 'Lower', 'No Transpose', 'Unit', + CALL STRMM('Left', 'Lower', 'No Transpose', 'Unit', $ K, N, NEG_ONE, V, LDV, C, LDC) ELSE IF (LQ) THEN ! We are computing C = CH' = C(I-V'T'V) @@ -131,10 +165,29 @@ SUBROUTINE SLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V, ! ! To achieve the same end result ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDER ) THEN + CALL XERBLA('DLARFB0C2', 2) + RETURN + ELSE IF(.NOT.TRANST) THEN + CALL XERBLA('DLARFB0C2', 3) + RETURN + END IF + ! ! C1 = C2*V2' ! - CALL SGEMM('No Transpose', 'Transpose', M, K, N-K, - $ ONE, C(1,K+1), LDC, V(1, K+1), LDV, ZERO, C, LDC) + IF( C2I ) THEN + DO J = 1, K + DO I = 1, M + C(I,J) = V(J,K+I) + END DO + END DO + ELSE + CALL SGEMM('No Transpose', 'Transpose', M, K, N-K, + $ ONE, C(1,K+1), LDC, V(1, K+1), LDV, ZERO, C, + $ LDC) + END IF ! ! C1 = C1*T' ! @@ -143,9 +196,18 @@ SUBROUTINE SLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V, ! ! C2 = C2 - C1*V2 = -C1*V2 + C2 ! - CALL SGEMM('No Transpose', 'No Transpose', M, N-K, K, - $ NEG_ONE, C, LDC, V(1,K+1), LDV, ONE, - $ C(1,K+1), LDC) + IF( C2I ) THEN + CALL SGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C, LDC, V(1,K+1), LDV, ZERO, C(1,K+1), + $ LDC) + DO I = 1, M + C(I,K+I) = C(I,K+I) + ONE + END DO + ELSE + CALL SGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C, LDC, V(1,K+1), LDV, ONE, C(1,K+1), + $ LDC) + END IF ! ! C1 = -C1*V1 ! @@ -186,10 +248,28 @@ SUBROUTINE SLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V, ! ! To achieve the same end result ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDEL ) THEN + CALL XERBLA('DLARFB0C2', 2) + RETURN + ELSE IF(TRANST) THEN + CALL XERBLA('DLARFB0C2', 3) + RETURN + END IF + ! ! C1 = V2'*C2 ! - CALL SGEMM('Transpose', 'No Transpose', K, N, M-K, - $ ONE, V, LDV, C, LDC, ZERO, C(M-K+1, 1), LDC) + IF( C2I ) THEN + DO J = 1, N + DO I = 1, K + C(M-K+I,J) = V(J,I) + END DO + END DO + ELSE + CALL SGEMM('Transpose', 'No Transpose', K, N, M-K, + $ ONE, V, LDV, C, LDC, ZERO, C(M-K+1, 1), LDC) + END IF ! ! C1 = T*C1 ! @@ -198,12 +278,20 @@ SUBROUTINE SLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V, ! ! C2 = C2 - V2*C1 = -V2*C1 + C2 ! - CALL SGEMM('No Transpose', 'No Transpose', M-K, N, K, - $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ONE, C, LDC) + IF( C2I ) THEN + CALL SGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ZERO, C, LDC) + DO I = 1, N + C(I,I) = C(I,I) + ONE + END DO + ELSE + CALL SGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ONE, C, LDC) + END IF ! ! C1 = -V1*C1 ! - CALL STRMM('Left', 'Upper', 'No Transpose', 'Unit', + CALL STRMM('Left', 'Upper', 'No Transpose', 'Unit', $ K, N, NEG_ONE, V(M-K+1,1), LDV, C(M-K+1,1), LDC) ELSE ! IF (RQ) THEN ! We are computing C = CH' = C(I-V'T'V) @@ -226,7 +314,7 @@ SUBROUTINE SLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V, ! ! = [ C2, 0 ] - C2*V2'*T'[ V2, V1 ] ! - ! = [ C2, 0 ] - [ C2*V2'*T'*V2, C2*V2'*T'*V1 ] + ! = [ C2, 0 ] - [ C2*V2'*T'*V2, C2*V2'*T'*V1 ] ! ! = [ C2 - C2*V2'*T'*V2, -C2*V2'*T'*V1 ] ! @@ -236,28 +324,54 @@ SUBROUTINE SLARFB0C2(SIDE, TRANS, DIRECT, STOREV, M, N, K, V, ! C1 = C1*T' ! C2 = C2 - C1*V2 ! C1 = -C1*V1 - ! + ! ! ! To achieve the same end result ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDER ) THEN + CALL XERBLA('DLARFB0C2', 2) + RETURN + ELSE IF(.NOT.TRANST) THEN + CALL XERBLA('DLARFB0C2', 3) + RETURN + END IF + ! ! C1 = C2*V2' ! - CALL SGEMM('No Transpose', 'Transpose', M, K, N-K, - $ ONE, C, LDC, V, LDV, ZERO, C(1, N-K+1), LDC) + IF( C2I ) THEN + DO J = 1, K + DO I = 1, M + C(I,N-K+J) = V(J,I) + END DO + END DO + ELSE + CALL SGEMM('No Transpose', 'Transpose', M, K, N-K, + $ ONE, C, LDC, V, LDV, ZERO, C(1, N-K+1), LDC) + END IF ! ! C1 = C1*T' ! - CALL STRMM('Right', 'Lower', 'Transpose', 'Non-unit', + CALL STRMM('Right', 'Lower', 'Transpose', 'Non-unit', $ M, K, ONE, T, LDT, C(1, N-K+1), LDC) ! ! C2 = C2 - C1*V2 = -C1*V2 + C2 ! - CALL SGEMM('No Transpose', 'No Transpose', M, N-K, K, - $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ONE, C, LDC) + IF( C2I ) THEN + CALL SGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ZERO, C, LDC) + DO I = 1, M + C(I,I) = C(I,I) + ONE + END DO + ELSE + CALL SGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ONE, C, LDC) + END IF ! ! C1 = -C1*V1 ! - CALL STRMM('Right', 'Lower', 'No Transpose', 'Unit', + CALL STRMM('Right', 'Lower', 'No Transpose', 'Unit', $ M, K, NEG_ONE, V(1, N-K+1), LDV, C(1,N-K+1), LDC) END IF END SUBROUTINE diff --git a/SRC/sorglq.f b/SRC/sorglq.f index fb5ef79cce..574b3ae2c1 100644 --- a/SRC/sorglq.f +++ b/SRC/sorglq.f @@ -146,7 +146,7 @@ SUBROUTINE SORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) $ LWKOPT, NB, NBMIN, NX * .. * .. External Subroutines .. - EXTERNAL SLARFB, SLARFT, SORGL2, XERBLA + EXTERNAL SLARFB0C2, SLARFT, SORGL2, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -218,67 +218,90 @@ SUBROUTINE SORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Use blocked code after the last block. -* The first kk rows are handled by the block method. +* Handle the first block assuming we are applying to the +* identity, then resume regular blocking method after * - KI = ( ( K-NX-1 ) / NB )*NB - KK = MIN( K, KI+NB ) -* -* Set A(kk+1:m,1:kk) to zero. -* - DO 20 J = 1, KK - DO 10 I = KK + 1, M - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE + KI = K - 2 * NB + KK = K - NB ELSE KK = 0 END IF * -* Use unblocked code for the last or only block. +* Potentially bail to the unblocked version * - IF( KK.LT.M ) - $ CALL SORGL2( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA, - $ TAU( KK+1 ), WORK, IINFO ) + IF( KK.EQ.0 ) THEN + CALL SORGL2( M, N, K, A, LDA, TAU, WORK, IINFO ) + END IF * IF( KK.GT.0 ) THEN + I = KK + 1 + IB = NB +* +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) +* + CALL SLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ), + $ LDA, TAU( I ), WORK, LDWORK ) +* +* Apply H**T to A(i+ib:m,i:n) from the right +* + CALL SLARFB0C2(.TRUE., 'Right', 'Transpose', 'Forward', + $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, + $ LDWORK, A(I+IB,I), LDA) +* +* Apply H**T to columns i:n of current block + + CALL SORGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ), + $ WORK, IINFO ) * * Use blocked code * - DO 50 I = KI + 1, 1, -NB - IB = MIN( NB, K-I+1 ) - IF( I+IB.LE.M ) THEN + DO I = KI + 1, 1, -NB + IB = NB * -* Form the triangular factor of the block reflector -* H = H(i) H(i+1) . . . H(i+ib-1) +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * - CALL SLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, - $ I ), - $ LDA, TAU( I ), WORK, LDWORK ) + CALL SLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ), + $ LDA, TAU( I ), WORK, LDWORK ) * -* Apply H**T to A(i+ib:m,i:n) from the right +* Apply H**T to A(i+ib:m,i:n) from the right * - CALL SLARFB( 'Right', 'Transpose', 'Forward', - $ 'Rowwise', - $ M-I-IB+1, N-I+1, IB, A( I, I ), LDA, WORK, - $ LDWORK, A( I+IB, I ), LDA, WORK( IB+1 ), - $ LDWORK ) - END IF + CALL SLARFB0C2(.FALSE., 'Right', 'Transpose', 'Forward', + $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, + $ LDWORK, A(I+IB,I), LDA) * * Apply H**T to columns i:n of current block -* + CALL SORGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ), - $ WORK, - $ IINFO ) + $ WORK, IINFO ) + END DO +* +* This checks for if K was a perfect multiple of NB +* so that we only have a special case for the last block when +* necessary +* + IF(I.LT.1) THEN + IB = I + NB - 1 + I = 1 +* +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) +* + CALL SLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ), + $ LDA, TAU( I ), WORK, LDWORK ) * -* Set columns 1:i-1 of current block to zero +* Apply H**T to A(i+ib:m,i:n) from the right * - DO 40 J = 1, I - 1 - DO 30 L = I, I + IB - 1 - A( L, J ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE + CALL SLARFB0C2(.FALSE., 'Right', 'Transpose', 'Forward', + $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, + $ LDWORK, A(I+IB,I), LDA) +* +* Apply H**T to columns i:n of current block + + CALL SORGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ), + $ WORK, IINFO ) + END IF END IF * WORK( 1 ) = SROUNDUP_LWORK(IWS) diff --git a/SRC/sorgql.f b/SRC/sorgql.f index 7257ff2d95..882e7d4889 100644 --- a/SRC/sorgql.f +++ b/SRC/sorgql.f @@ -143,11 +143,10 @@ SUBROUTINE SORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, J, KK, L, LDWORK, LWKOPT, - $ NB, NBMIN, NX + INTEGER I, IB, IINFO, IWS, J, KK, LWKOPT, NB, NBMIN * .. * .. External Subroutines .. - EXTERNAL SLARFB, SLARFT, SORG2L, XERBLA + EXTERNAL SLARFB0C2, SLARFT, SORG2L, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -178,7 +177,8 @@ SUBROUTINE SORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) LWKOPT = 1 ELSE NB = ILAENV( 1, 'SORGQL', ' ', M, N, K, -1 ) - LWKOPT = N*NB + ! Only need a workspace for calls to dorg2l + LWKOPT = N END IF WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * @@ -201,88 +201,76 @@ SUBROUTINE SORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) END IF * NBMIN = 2 - NX = 0 + NX = MAX(0, ILAENV(3, 'SORGQL', ' ', M, N, K, -1)) IWS = N - IF( NB.GT.1 .AND. NB.LT.K ) THEN * -* Determine when to cross over from blocked to unblocked code. -* - NX = MAX( 0, ILAENV( 3, 'SORGQL', ' ', M, N, K, -1 ) ) - IF( NX.LT.K ) THEN + IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Determine if workspace is large enough for blocked code. +* We want to use the blocking method as long as our matrix is big enough * - LDWORK = N - IWS = LDWORK*NB - IF( LWORK.LT.IWS ) THEN + KK = K + ELSE + KK = 0 + END IF * -* Not enough workspace to use optimal NB: reduce NB and -* determine the minimum value of NB. +* Possibly bail to the unblocked code. * - NB = LWORK / LDWORK - NBMIN = MAX( 2, ILAENV( 2, 'SORGQL', ' ', M, N, K, - $ -1 ) ) - END IF - END IF + IF( KK.EQ.0 ) THEN + CALL SORG2L( M, N, K, A, LDA, TAU, WORK, IINFO ) END IF * - IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN + IF( KK.GT.0 ) THEN * -* Use blocked code after the first block. -* The last kk columns are handled by the block method. +* Factor the first block assuming that our first application +* will be on the Identity matrix * - KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB ) + I = 1 + IB = NB * -* Set A(m-kk+1:m,1:n-kk) to zero. +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) * - DO 20 J = 1, N - KK - DO 10 I = M - KK + 1, M - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE - ELSE - KK = 0 - END IF + CALL SLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB, + $ A( 1, N-K+I ), LDA, TAU( I ), + $ A( M-K+I, N-K+I ), LDA) * -* Use unblocked code for the first or only block. +* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left * - CALL SORG2L( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO ) + CALL SLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Backward', + $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, A(1, N-K+I), + $ LDA, A( M-K+I, N-K+I ), LDA, A, LDA) * - IF( KK.GT.0 ) THEN +* Apply H to rows 1:m-k+i+ib-1 of current block * -* Use blocked code + CALL SORG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA, + $ TAU( I ), WORK, IINFO ) + +* Use blocked code on the remaining blocks if there are any. * - DO 50 I = K - KK + 1, K, NB - IB = MIN( NB, K-I+1 ) - IF( N-K+I.GT.1 ) THEN + DO I = NB+1, K, NB * -* Form the triangular factor of the block reflector -* H = H(i+ib-1) . . . H(i+1) H(i) +* The last block may be less than size NB * - CALL SLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB, - $ A( 1, N-K+I ), LDA, TAU( I ), WORK, LDWORK ) + IB = MIN(NB, K-I+1) * -* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) * - CALL SLARFB( 'Left', 'No transpose', 'Backward', - $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, - $ A( 1, N-K+I ), LDA, WORK, LDWORK, A, LDA, - $ WORK( IB+1 ), LDWORK ) - END IF + CALL SLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB, + $ A( 1, N-K+I ), LDA, TAU( I ), + $ A( M-K+I, N-K+I ), LDA ) +* +* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left +* + CALL SLARFB0C2(.FALSE., 'Left', 'No Transpose', + $ 'Backward', 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, + $ A(1, N-K+I), LDA, A( M-K+I, N-K+I ), LDA, A, LDA) * * Apply H to rows 1:m-k+i+ib-1 of current block * CALL SORG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA, $ TAU( I ), WORK, IINFO ) -* -* Set rows m-k+i+ib:m of current block to zero -* - DO 40 J = N - K + I, N - K + I + IB - 1 - DO 30 L = M - K + I + IB, M - A( L, J ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE + END DO END IF * WORK( 1 ) = SROUNDUP_LWORK(IWS) diff --git a/SRC/sorgqr.f b/SRC/sorgqr.f index 47d1fd248c..65d397de36 100644 --- a/SRC/sorgqr.f +++ b/SRC/sorgqr.f @@ -147,7 +147,7 @@ SUBROUTINE SORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) $ LWKOPT, NB, NBMIN, NX * .. * .. External Subroutines .. - EXTERNAL SLARFB, SLARFT, SORG2R, XERBLA + EXTERNAL SLARFB0C2, SLARFT, SORG2R, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -163,7 +163,7 @@ SUBROUTINE SORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * INFO = 0 NB = ILAENV( 1, 'SORGQR', ' ', M, N, K, -1 ) - LWKOPT = MAX( 1, N )*NB + LWKOPT = MAX( 1, N ) WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN @@ -194,92 +194,92 @@ SUBROUTINE SORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) NBMIN = 2 NX = 0 IWS = N - IF( NB.GT.1 .AND. NB.LT.K ) THEN * -* Determine when to cross over from blocked to unblocked code. + IF( NB.GE.NBMIN .AND. NB.LT.K ) THEN * - NX = MAX( 0, ILAENV( 3, 'SORGQR', ' ', M, N, K, -1 ) ) - IF( NX.LT.K ) THEN +* Handle the first block assuming we are applying to the +* identity, then resume regular blocking method after * -* Determine if workspace is large enough for blocked code. -* - LDWORK = N - IWS = LDWORK*NB - IF( LWORK.LT.IWS ) THEN + KI = K - 2 * NB + KK = K - NB + ELSE + KK = 0 + END IF * -* Not enough workspace to use optimal NB: reduce NB and -* determine the minimum value of NB. +* Potentially bail to the unblocked code. * - NB = LWORK / LDWORK - NBMIN = MAX( 2, ILAENV( 2, 'SORGQR', ' ', M, N, K, - $ -1 ) ) - END IF - END IF + IF( KK.EQ.0 ) THEN + CALL SORG2R( M, N, K, A, LDA, TAU, WORK, IINFO ) END IF * - IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN + IF( KK.GT.0 ) THEN + I = KK + 1 + IB = NB * -* Use blocked code after the last block. -* The first kk columns are handled by the block method. +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * - KI = ( ( K-NX-1 ) / NB )*NB - KK = MIN( K, KI+NB ) + CALL SLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + $ LDA, TAU(I), A(I,I), LDA) * -* Set A(1:kk,kk+1:n) to zero. +* Apply H to A(i:m,i+ib:n) from the left * - DO 20 J = KK + 1, N - DO 10 I = 1, KK - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE - ELSE - KK = 0 - END IF + CALL SLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Forward', + $ 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), LDA, A(I,I), + $ LDA, A(I,I+IB), LDA) * -* Use unblocked code for the last or only block. +* Apply H to rows i:m of current block * - IF( KK.LT.N ) - $ CALL SORG2R( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA, - $ TAU( KK+1 ), WORK, IINFO ) + CALL SORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, IINFO) + DO I = KI + 1, 1, -NB + IB = NB * - IF( KK.GT.0 ) THEN +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * -* Use blocked code + CALL SLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + $ LDA, TAU(I), A(I,I), LDA) * - DO 50 I = KI + 1, 1, -NB - IB = MIN( NB, K-I+1 ) - IF( I+IB.LE.N ) THEN +* Apply H to A(i:m,i+ib:n) from the left * -* Form the triangular factor of the block reflector -* H = H(i) H(i+1) . . . H(i+ib-1) + CALL SLARFB0C2(.FALSE., 'Left', 'No Transpose', + $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), + $ LDA, A(I,I), LDA, A(I,I+IB), LDA) + * - CALL SLARFT( 'Forward', 'Columnwise', M-I+1, IB, - $ A( I, I ), LDA, TAU( I ), WORK, LDWORK ) +* Apply H to rows i:m of current block * -* Apply H to A(i:m,i+ib:n) from the left + CALL SORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, + $ IINFO) + END DO * - CALL SLARFB( 'Left', 'No transpose', 'Forward', - $ 'Columnwise', M-I+1, N-I-IB+1, IB, - $ A( I, I ), LDA, WORK, LDWORK, A( I, I+IB ), - $ LDA, WORK( IB+1 ), LDWORK ) - END IF +* This checks for if K was a perfect multiple of NB +* so that we only have a special case for the last block when +* necessary * -* Apply H to rows i:m of current block + IF(I.LT.1) THEN + IB = I + NB - 1 + I = 1 * - CALL SORG2R( M-I+1, IB, IB, A( I, I ), LDA, TAU( I ), - $ WORK, - $ IINFO ) +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * -* Set rows 1:i-1 of current block to zero + CALL SLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + $ LDA, TAU(I), A(I,I), LDA) * - DO 40 J = I, I + IB - 1 - DO 30 L = 1, I - 1 - A( L, J ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE - END IF +* Apply H to A(i:m,i+ib:n) from the left +* + CALL SLARFB0C2(.FALSE., 'Left', 'No Transpose', + $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), + $ LDA, A(I,I), LDA, A(I,I+IB), LDA) + * +* Apply H to rows i:m of current block +* + CALL SORG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, + $ IINFO) + END IF + END IF WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * diff --git a/SRC/sorgrq.f b/SRC/sorgrq.f index 5fe685c1d8..ea1a4e73da 100644 --- a/SRC/sorgrq.f +++ b/SRC/sorgrq.f @@ -147,7 +147,7 @@ SUBROUTINE SORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) $ LWKOPT, NB, NBMIN, NX * .. * .. External Subroutines .. - EXTERNAL SLARFB, SLARFT, SORGR2, XERBLA + EXTERNAL SLARFB0C2, SLARFT, SORGR2, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -228,63 +228,70 @@ SUBROUTINE SORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Use blocked code after the first block. -* The last kk rows are handled by the block method. +* We want to use the blocking method as long as our matrix is big enough +* and it's deemed worthwhile with the extra memory allocations * - KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB ) -* -* Set A(1:m-kk,n-kk+1:n) to zero. -* - DO 20 J = N - KK + 1, N - DO 10 I = 1, M - KK - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE + KK = K ELSE KK = 0 END IF * -* Use unblocked code for the first or only block. +* Potentially bail to the unblocked code * - CALL SORGR2( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO ) + IF( KK.EQ.0 ) THEN + CALL SORGR2( M, N, K, A, LDA, TAU, WORK, IINFO ) + END IF * IF( KK.GT.0 ) THEN * -* Use blocked code +* Factor the first block assuming that our first application +* will be on the Identity matrix * - DO 50 I = K - KK + 1, K, NB - IB = MIN( NB, K-I+1 ) - II = M - K + I - IF( II.GT.1 ) THEN + I = 1 + IB = NB + II = M - K + I * -* Form the triangular factor of the block reflector -* H = H(i+ib-1) . . . H(i+1) H(i) +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) * - CALL SLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB, - $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK ) + CALL SLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB, + $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK ) * -* Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right +* Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right * - CALL SLARFB( 'Right', 'Transpose', 'Backward', - $ 'Rowwise', - $ II-1, N-K+I+IB-1, IB, A( II, 1 ), LDA, WORK, - $ LDWORK, A, LDA, WORK( IB+1 ), LDWORK ) - END IF + CALL SLARFB0C2(.TRUE., 'Right', 'Transpose', 'Backward', + $ 'Rowwise', II-1, N-K+I+IB-1, IB, A(II,1), LDA, WORK, + $ LDWORK, A, LDA) * * Apply H**T to columns 1:n-k+i+ib-1 of current block * - CALL SORGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA, - $ TAU( I ), - $ WORK, IINFO ) + CALL SORGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA, + $ TAU( I ), WORK, IINFO ) + + DO I = NB + 1, K, NB * -* Set columns n-k+i+ib:n of current block to zero +* The last block may be less than size NB +* + IB = MIN(NB, K-I+1) + II = M - K + I * - DO 40 L = N - K + I + IB, N - DO 30 J = II, II + IB - 1 - A( J, L ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) +* + CALL SLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB, + $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK ) +* +* Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right +* + CALL SLARFB0C2(.FALSE., 'Right', 'Transpose', + $ 'Backward', 'Rowwise', II-1, N-K+I+IB-1, IB, + $ A(II,1), LDA, WORK, LDWORK, A, LDA) +* +* Apply H**T to columns 1:n-k+i+ib-1 of current block +* + CALL SORGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA, + $ TAU( I ), WORK, IINFO ) + END DO END IF * WORK( 1 ) = SROUNDUP_LWORK(IWS) From 338bae27fcb4bf507ceaeeb1387f0f72705213ca Mon Sep 17 00:00:00 2001 From: Johnathan Rhyne Date: Tue, 11 Feb 2025 16:32:05 -0700 Subject: [PATCH 03/10] double complex tests pass locally --- SRC/CMakeLists.txt | 2 +- SRC/Makefile | 2 +- SRC/dorglq.f | 8 +- SRC/dorgql.f | 6 +- SRC/dorgqr.f | 6 +- SRC/dorgrq.f | 6 +- SRC/sorgql.f | 3 +- SRC/zlarfb0c2.f | 381 +++++++++++++++++++++++++++++++++++++++++++++ SRC/zunglq.f | 124 +++++++++------ SRC/zungql.f | 121 +++++++------- SRC/zungqr.f | 148 ++++++++++-------- SRC/zungrq.f | 90 +++++------ 12 files changed, 644 insertions(+), 253 deletions(-) create mode 100644 SRC/zlarfb0c2.f diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt index 12cfebca5f..16156b72a2 100644 --- a/SRC/CMakeLists.txt +++ b/SRC/CMakeLists.txt @@ -418,7 +418,7 @@ set(ZLASRC zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqp2rk.f zlaqp3rk.f zlaqsb.f zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f - zlarcm.f zlarf.f zlarfb.f zlarfb_gett.f zlarf1f.f zlarf1l.f + zlarcm.f zlarf.f zlarfb.f zlarfb0c2.f zlarfb_gett.f zlarf1f.f zlarf1l.f zlarfg.f zlarfgp.f zlarft.f zlarfx.f zlarfy.f zlargv.f zlarnv.f zlarrv.f zlartg.f90 zlartv.f zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f diff --git a/SRC/Makefile b/SRC/Makefile index 5fb5db20a6..44ba06be9d 100644 --- a/SRC/Makefile +++ b/SRC/Makefile @@ -453,7 +453,7 @@ ZLASRC = \ zlaqhb.o zlaqhe.o zlaqhp.o zlaqp2.o zlaqps.o zlaqp2rk.o zlaqp3rk.o zlaqsb.o \ zlaqr0.o zlaqr1.o zlaqr2.o zlaqr3.o zlaqr4.o zlaqr5.o \ zlaqsp.o zlaqsy.o zlar1v.o zlar2v.o ilazlr.o ilazlc.o \ - zlarcm.o zlarf.o zlarfb.o zlarfb_gett.o zlarf1f.o zlarf1l.o \ + zlarcm.o zlarf.o zlarfb.o zlarfb0c2.o zlarfb_gett.o zlarf1f.o zlarf1l.o \ zlarfg.o zlarft.o zlarfgp.o \ zlarfx.o zlarfy.o zlargv.o zlarnv.o zlarrv.o zlartg.o zlartv.o \ zlarz.o zlarzb.o zlarzt.o zlascl.o zlaset.o zlasr.o \ diff --git a/SRC/dorglq.f b/SRC/dorglq.f index 6507f625b3..c7f32afb8f 100644 --- a/SRC/dorglq.f +++ b/SRC/dorglq.f @@ -138,14 +138,10 @@ SUBROUTINE DORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * ===================================================================== * -* .. Parameters .. - DOUBLE PRECISION ZERO - PARAMETER ( ZERO = 0.0D+0 ) -* .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, J, KI, KK, L, LDWORK, - $ LWKOPT, NB, NBMIN, NX + INTEGER I, IB, IINFO, IWS, KI, KK, LDWORK, LWKOPT, + $ NB, NBMIN, NX * .. * .. External Subroutines .. EXTERNAL DLARFB0C2, DLARFT, DORGL2, XERBLA diff --git a/SRC/dorgql.f b/SRC/dorgql.f index 9f34c858f9..13146142ed 100644 --- a/SRC/dorgql.f +++ b/SRC/dorgql.f @@ -139,13 +139,9 @@ SUBROUTINE DORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * ===================================================================== * -* .. Parameters .. - DOUBLE PRECISION ZERO - PARAMETER ( ZERO = 0.0D+0 ) -* .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, J, KK, LWKOPT, NB, NBMIN + INTEGER I, IB, IINFO, IWS, KK, LWKOPT, NB, NBMIN * .. * .. External Subroutines .. EXTERNAL DLARFB0C2, DLARFT, DORG2L, XERBLA diff --git a/SRC/dorgqr.f b/SRC/dorgqr.f index 0ed278ffc0..4b323f68fb 100644 --- a/SRC/dorgqr.f +++ b/SRC/dorgqr.f @@ -139,13 +139,9 @@ SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * ===================================================================== * -* .. Parameters .. - DOUBLE PRECISION ZERO - PARAMETER ( ZERO = 0.0D+0 ) -* .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, J, KI, KK, LWKOPT, + INTEGER I, IB, IINFO, IWS, KI, KK, LWKOPT, $ NB, NBMIN, NX * .. * .. External Subroutines .. diff --git a/SRC/dorgrq.f b/SRC/dorgrq.f index ce61238776..811c2cdb5e 100644 --- a/SRC/dorgrq.f +++ b/SRC/dorgrq.f @@ -139,13 +139,9 @@ SUBROUTINE DORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * ===================================================================== * -* .. Parameters .. - DOUBLE PRECISION ZERO - PARAMETER ( ZERO = 0.0D+0 ) -* .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, II, IINFO, IWS, J, KK, L, LDWORK, + INTEGER I, IB, II, IINFO, IWS, KK, LDWORK, $ LWKOPT, NB, NBMIN, NX * .. * .. External Subroutines .. diff --git a/SRC/sorgql.f b/SRC/sorgql.f index 882e7d4889..69484d2c36 100644 --- a/SRC/sorgql.f +++ b/SRC/sorgql.f @@ -143,7 +143,8 @@ SUBROUTINE SORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, J, KK, LWKOPT, NB, NBMIN + INTEGER I, IB, IINFO, IWS, J, KK, LWKOPT, NB, + $ NBMIN, NX * .. * .. External Subroutines .. EXTERNAL SLARFB0C2, SLARFT, SORG2L, XERBLA diff --git a/SRC/zlarfb0c2.f b/SRC/zlarfb0c2.f new file mode 100644 index 0000000000..3bbf11e268 --- /dev/null +++ b/SRC/zlarfb0c2.f @@ -0,0 +1,381 @@ + SUBROUTINE ZLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, + $ K, V, LDV, T, LDT, C, LDC) + ! Scalar arguments + INTEGER M, N, K, LDV, LDC, LDT + CHARACTER SIDE, TRANS, DIRECT, STOREV + ! True means that we are assuming C2 is the identity matrix + ! and thus don't reference whatever is present in C2 + ! at the beginning. + LOGICAL C2I + + ! Array arguments + COMPLEX*16 V(LDV,*), C(LDC,*), T(LDT,*) + ! Local scalars + LOGICAL QR, LQ, QL, DIRF, COLV, SIDEL, SIDER, + $ TRANST + INTEGER I, J + ! Intrinsic Functions + INTRINSIC CONJG + ! External functions + LOGICAL LSAME + EXTERNAL LSAME + ! External Subroutines + EXTERNAL ZGEMM, ZTRMM + ! Parameters + COMPLEX*16 ONE, ZERO, NEG_ONE + PARAMETER(ONE=(1.0D+0, 0.0D+0), + $ ZERO = (0.0D+0, 0.0D+0), + $ NEG_ONE = (-1.0D+0, 0.0D+0)) + + ! Beginning of executable statements + ! Convert our character flags to logical values + DIRF = LSAME(DIRECT,'F') + COLV = LSAME(STOREV,'C') + SIDEL = LSAME(SIDE,'L') + SIDER = LSAME(SIDE,'R') + TRANST = LSAME(TRANS,'C') + + ! Determine which of the 4 modes are using. + ! QR is when we store the reflectors column by column and have the + ! 'first' reflector stored in the first column + QR = DIRF.AND.COLV + + ! LQ is when we store the reflectors row by row and have the + ! 'first' reflector stored in the first row + LQ = DIRF.AND.(.NOT.COLV) + + ! QL is when we store the reflectors column by column and have the + ! 'first' reflector stored in the last column + QL = (.NOT.DIRF).AND.COLV + + ! RQ is when we store the reflectors row by row and have the + ! 'first' reflector stored in the last row + ! RQ = (.NOT.DIRF).AND.(.NOT.COLV) + ! Since we have exactly one of these 4 modes, we don't need to actually + ! store the value of RQ, instead we assume this is the case if we fail + ! the above 3 checks. + + IF (QR) THEN + ! We are computing C = HC = (I - VTV')C + ! Where: V = [ V1 ] and C = [ C1 ] + ! [ V2 ] [ C2 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{M-K\times K} + ! C1=0\in\R^{K\times N} + ! C2\in\R^{M-K\times N} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = HC = (I - [ V1 ]T [V1', V2'])[ 0 ] + ! [ V2 ] [ C2 ] + ! = [ 0 ] - [ V1 ]T*V2'*C2 + ! [ C2 ] [ V2 ] + ! + ! = [ 0 ] - [ V1*T*V2'*C2 ] + ! [ C2 ] [ V2*T*V2'*C2 ] + ! + ! = [ V1*T*V2'*C2 ] + ! [ C2 - V2*T*V2'*C2 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = V2'*C2 + ! C1 = T*C1 + ! C2 = C2 - V2*C1 + ! C1 = -V1*C1 + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDEL ) THEN + CALL XERBLA('DLARFB0C2', 2) + RETURN + ELSE IF(TRANST) THEN + CALL XERBLA('DLARFB0C2', 3) + RETURN + END IF + ! + ! C1 = V2'*C2 + ! + IF (C2I) THEN + DO J = 1, N + DO I = 1, K + C(I,J) = CONJG(V(K+J,I)) + END DO + END DO + ELSE + CALL ZGEMM('Conjugate', 'No Transpose', K, N, M - K, + $ ONE, V(K+1,1), LDV, C(K+1,1), LDC, ZERO, + $ C, LDC) + END IF + ! + ! C1 = T*C1 + ! + CALL ZTRMM('Left', 'Upper', 'No Transpose', 'Non-unit', + $ K, N, ONE, T, LDT, C, LDC) + ! + ! C2 = C2 - V2*C1 = -V2*C1 + C2 + ! + IF (C2I) THEN + CALL ZGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V(K+1,1), LDV, C, LDC, ZERO, + $ C(K+1,1), LDC) + DO I = 1, N + C(K+I,I) = C(K+I,I) + ONE + END DO + ELSE + CALL ZGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V(K+1,1), LDV, C, LDC, ONE, + $ C(K+1,1), LDC) + END IF + ! + ! C1 = -V1*C1 + ! + CALL ZTRMM('Left', 'Lower', 'No Transpose', 'Unit', + $ K, N, NEG_ONE, V, LDV, C, LDC) + ELSE IF (LQ) THEN + ! We are computing C = CH' = C(I-V'T'V) + ! Where: V = [ V1 V2 ] and C = [ C1 C2 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{K\times N-K} + ! C1=0\in\R^{M\times K} + ! C2\in\R^{M\times N-K} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = CH' = [ 0, C2 ](I - [ V1' ]T'[ V1, V2 ]) + ! [ V2' ] + ! + ! = [ 0, C2 ] - [ 0, C2 ][ V1' ]T'[ V1, V2 ] + ! [ V2' ] + ! + ! = [ 0, C2 ] - C2*V2'*T'[ V1, V2 ] + ! + ! = [ -C2*V2'*T'*V1, C2 - C2*V2'*T'*V2 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = C2*V2' + ! C1 = C1*T' + ! C2 = C2 - C1*V2 + ! C1 = -C1*V1 + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDER ) THEN + CALL XERBLA('DLARFB0C2', 2) + RETURN + ELSE IF(.NOT.TRANST) THEN + CALL XERBLA('DLARFB0C2', 3) + RETURN + END IF + ! + ! C1 = C2*V2' + ! + IF( C2I ) THEN + DO J = 1, K + DO I = 1, M + C(I,J) = CONJG(V(J,K+I)) + END DO + END DO + ELSE + CALL ZGEMM('No Transpose', 'Conjugate', M, K, N-K, + $ ONE, C(1,K+1), LDC, V(1, K+1), LDV, ZERO, C, + $ LDC) + END IF + ! + ! C1 = C1*T' + ! + CALL ZTRMM('Right', 'Upper', 'Conjugate', 'Non-unit', + $ M, K, ONE, T, LDT, C, LDC) + ! + ! C2 = C2 - C1*V2 = -C1*V2 + C2 + ! + IF( C2I ) THEN + CALL ZGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C, LDC, V(1,K+1), LDV, ZERO, C(1,K+1), + $ LDC) + DO I = 1, M + C(I,K+I) = C(I,K+I) + ONE + END DO + ELSE + CALL ZGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C, LDC, V(1,K+1), LDV, ONE, C(1,K+1), + $ LDC) + END IF + ! + ! C1 = -C1*V1 + ! + CALL ZTRMM('Right', 'Upper', 'No Transpose', 'Unit', + $ M, K, NEG_ONE, V, LDV, C, LDC) + ELSE IF (QL) THEN + ! We are computing C = HC = (I - VTV')C + ! Where: V = [ V2 ] and C = [ C2 ] + ! [ V1 ] [ C1 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{M-K\times K} + ! C1=0\in\R^{K\times N} + ! C2\in\R^{M-K\times N} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = HC = (I-[ V2 ]T[ V2' V1' ])[ C2 ] + ! [ V1 ] [ 0 ] + ! + ! = [ C2 ] - [ V2 ]T*V2'*C2 + ! [ 0 ] [ V1 ] + ! + ! = [ C2 ] - [ V2*T*V2'*C2 ] + ! [ 0 ] [ V1*T*V2'*C2 ] + ! + ! = [ C2 - V2*T*V2'*C2 ] + ! [ - V1*T*V2'*C2 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = V2'*C2 + ! C1 = T*C1 + ! C2 = C2 - V2*C1 + ! C1 = -V1*C1 + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDEL ) THEN + CALL XERBLA('DLARFB0C2', 2) + RETURN + ELSE IF(TRANST) THEN + CALL XERBLA('DLARFB0C2', 3) + RETURN + END IF + ! + ! C1 = V2'*C2 + ! + IF( C2I ) THEN + DO J = 1, N + DO I = 1, K + C(M-K+I,J) = CONJG(V(J,I)) + END DO + END DO + ELSE + CALL ZGEMM('Conjugate', 'No Transpose', K, N, M-K, + $ ONE, V, LDV, C, LDC, ZERO, C(M-K+1, 1), LDC) + END IF + ! + ! C1 = T*C1 + ! + CALL ZTRMM('Left', 'Lower', 'No Transpose', 'Non-unit', + $ K, N, ONE, T, LDT, C(M-K+1,1), LDC) + ! + ! C2 = C2 - V2*C1 = -V2*C1 + C2 + ! + IF( C2I ) THEN + CALL ZGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ZERO, C, LDC) + DO I = 1, N + C(I,I) = C(I,I) + ONE + END DO + ELSE + CALL ZGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ONE, C, LDC) + END IF + ! + ! C1 = -V1*C1 + ! + CALL ZTRMM('Left', 'Upper', 'No Transpose', 'Unit', + $ K, N, NEG_ONE, V(M-K+1,1), LDV, C(M-K+1,1), LDC) + ELSE ! IF (RQ) THEN + ! We are computing C = CH' = C(I-V'T'V) + ! Where: V = [ V2 V1] and C = [ C2 C1 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{K\times N-K} + ! C1=0\in\R^{M\times K} + ! C2\in\R^{M\times N-K} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = CH' = [ C2, 0 ] (I - [ V2' ]T'[ V2, V1 ] + ! [ V1' ] + ! + ! = [ C2, 0 ] - [ C2, 0 ] [ V2' ]T'[ V2, V1 ] + ! [ V1' ] + ! + ! = [ C2, 0 ] - C2*V2'*T'[ V2, V1 ] + ! + ! = [ C2, 0 ] - [ C2*V2'*T'*V2, C2*V2'*T'*V1 ] + ! + ! = [ C2 - C2*V2'*T'*V2, -C2*V2'*T'*V1 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = C2*V2' + ! C1 = C1*T' + ! C2 = C2 - C1*V2 + ! C1 = -C1*V1 + ! + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDER ) THEN + CALL XERBLA('DLARFB0C2', 2) + RETURN + ELSE IF(.NOT.TRANST) THEN + CALL XERBLA('DLARFB0C2', 3) + RETURN + END IF + ! + ! C1 = C2*V2' + ! + IF( C2I ) THEN + DO J = 1, K + DO I = 1, M + C(I,N-K+J) = CONJG(V(J,I)) + END DO + END DO + ELSE + CALL ZGEMM('No Transpose', 'Conjugate', M, K, N-K, + $ ONE, C, LDC, V, LDV, ZERO, C(1, N-K+1), LDC) + END IF + ! + ! C1 = C1*T' + ! + CALL ZTRMM('Right', 'Lower', 'Conjugate', 'Non-unit', + $ M, K, ONE, T, LDT, C(1, N-K+1), LDC) + ! + ! C2 = C2 - C1*V2 = -C1*V2 + C2 + ! + IF( C2I ) THEN + CALL ZGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ZERO, C, LDC) + DO I = 1, M + C(I,I) = C(I,I) + ONE + END DO + ELSE + CALL ZGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ONE, C, LDC) + END IF + ! + ! C1 = -C1*V1 + ! + CALL ZTRMM('Right', 'Lower', 'No Transpose', 'Unit', + $ M, K, NEG_ONE, V(1, N-K+1), LDV, C(1,N-K+1), LDC) + END IF + END SUBROUTINE diff --git a/SRC/zunglq.f b/SRC/zunglq.f index 3cc107560d..9da595f491 100644 --- a/SRC/zunglq.f +++ b/SRC/zunglq.f @@ -121,7 +121,7 @@ *> \ingroup unglq * * ===================================================================== - SUBROUTINE ZUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) + SUBROUTINE ZUNGLQ(M, N, K, A, LDA, TAU, WORK, LWORK, INFO) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -136,17 +136,13 @@ SUBROUTINE ZUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * ===================================================================== * -* .. Parameters .. - COMPLEX*16 ZERO - PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) -* .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, J, KI, KK, L, LDWORK, - $ LWKOPT, NB, NBMIN, NX + INTEGER I, IB, IINFO, IWS, KI, KK, LWKOPT, LDWORK, + $ NB, NBMIN, NX * .. * .. External Subroutines .. - EXTERNAL XERBLA, ZLARFB, ZLARFT, ZUNGL2 + EXTERNAL XERBLA, ZLARFB0C2, ZLARFT, ZUNGL2 * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -217,67 +213,95 @@ SUBROUTINE ZUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Use blocked code after the last block. -* The first kk rows are handled by the block method. -* - KI = ( ( K-NX-1 ) / NB )*NB - KK = MIN( K, KI+NB ) -* -* Set A(kk+1:m,1:kk) to zero. +* Treat the last NB block starting at KK+1 specially then use our blocking +* method from the block starting at KI+1 to 1 * - DO 20 J = 1, KK - DO 10 I = KK + 1, M - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE + KI = K - 2 * NB + KK = K - NB ELSE KK = 0 END IF * -* Use unblocked code for the last or only block. +* Potentially bail to the unblocked version * - IF( KK.LT.M ) - $ CALL ZUNGL2( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA, - $ TAU( KK+1 ), WORK, IINFO ) + IF( KK.EQ.0 ) THEN + CALL ZUNGL2( M, N, K, A, LDA, TAU, WORK, IINFO ) + END IF * IF( KK.GT.0 ) THEN * -* Use blocked code +* Factor the last block assuming that our first application +* will be on the Identity matrix * - DO 50 I = KI + 1, 1, -NB - IB = MIN( NB, K-I+1 ) - IF( I+IB.LE.M ) THEN + I = KK + 1 + IB = NB * -* Form the triangular factor of the block reflector -* H = H(i) H(i+1) . . . H(i+ib-1) +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * - CALL ZLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, - $ I ), - $ LDA, TAU( I ), WORK, LDWORK ) + CALL ZLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ), + $ LDA, TAU( I ), WORK, LDWORK ) * -* Apply H**H to A(i+ib:m,i:n) from the right +* Apply H**T to A(i+ib:m,i:n) from the right +* Exploit the fact that we are applying to an identity * - CALL ZLARFB( 'Right', 'Conjugate transpose', - $ 'Forward', - $ 'Rowwise', M-I-IB+1, N-I+1, IB, A( I, I ), - $ LDA, WORK, LDWORK, A( I+IB, I ), LDA, - $ WORK( IB+1 ), LDWORK ) - END IF + CALL ZLARFB0C2(.TRUE., 'Right', 'Conjugate', 'Forward', + $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, + $ LDWORK, A(I+IB,I), LDA) +* +* Apply H**T to columns i:n of current block + + CALL ZUNGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ), + $ WORK, IINFO ) +* +* Use our standard blocking method after the last block +* + DO I = KI + 1, 1, -NB + IB = NB +* +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * -* Apply H**H to columns i:n of current block + CALL ZLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ), + $ LDA, TAU( I ), WORK, LDWORK ) * +* Apply H**T to A(i+ib:m,i:n) from the right +* + CALL ZLARFB0C2(.FALSE., 'Right', 'Conjugate', 'Forward', + $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, + $ LDWORK, A(I+IB,I), LDA) +* +* Apply H**T to columns i:n of current block + CALL ZUNGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ), - $ WORK, - $ IINFO ) + $ WORK, IINFO ) + END DO +* +* This checks for if K was a perfect multiple of NB +* so that we only have a special case for the last block when +* necessary +* + IF(I.LT.1) THEN + IB = I + NB - 1 + I = 1 +* +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * -* Set columns 1:i-1 of current block to zero + CALL ZLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ), + $ LDA, TAU( I ), WORK, LDWORK ) * - DO 40 J = 1, I - 1 - DO 30 L = I, I + IB - 1 - A( L, J ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE +* Apply H**T to A(i+ib:m,i:n) from the right +* + CALL ZLARFB0C2(.FALSE., 'Right', 'Conjugate', 'Forward', + $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, + $ LDWORK, A(I+IB,I), LDA) +* +* Apply H**T to columns i:n of current block + + CALL ZUNGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ), + $ WORK, IINFO ) + END IF END IF * WORK( 1 ) = IWS diff --git a/SRC/zungql.f b/SRC/zungql.f index a64de501d3..e3e604e791 100644 --- a/SRC/zungql.f +++ b/SRC/zungql.f @@ -122,7 +122,7 @@ *> \ingroup ungql * * ===================================================================== - SUBROUTINE ZUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) + SUBROUTINE ZUNGQL(M, N, K, A, LDA, TAU, WORK, LWORK, INFO) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -136,18 +136,14 @@ SUBROUTINE ZUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * .. * * ===================================================================== -* -* .. Parameters .. - COMPLEX*16 ZERO - PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) -* .. +* * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, J, KK, L, LDWORK, LWKOPT, - $ NB, NBMIN, NX + INTEGER I, IB, IINFO, IWS, KK, LWKOPT, NB, NBMIN, + $ NX * .. * .. External Subroutines .. - EXTERNAL XERBLA, ZLARFB, ZLARFT, ZUNG2L + EXTERNAL XERBLA, ZLARFB0C2, ZLARFT, ZUNG2L * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -177,7 +173,11 @@ SUBROUTINE ZUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) LWKOPT = 1 ELSE NB = ILAENV( 1, 'ZUNGQL', ' ', M, N, K, -1 ) - LWKOPT = N*NB +* +* Only need a workspace for zung2l in case of bailout +* and for the panel factorization +* + LWKOPT = N END IF WORK( 1 ) = LWKOPT * @@ -200,88 +200,77 @@ SUBROUTINE ZUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) END IF * NBMIN = 2 - NX = 0 + NX = MAX( 0, ILAENV( 3, 'ZUNGQL', ' ', M, N, K, -1 )) IWS = N - IF( NB.GT.1 .AND. NB.LT.K ) THEN -* -* Determine when to cross over from blocked to unblocked code. * - NX = MAX( 0, ILAENV( 3, 'ZUNGQL', ' ', M, N, K, -1 ) ) - IF( NX.LT.K ) THEN + IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Determine if workspace is large enough for blocked code. +* We use blocked code for the entire construction * - LDWORK = N - IWS = LDWORK*NB - IF( LWORK.LT.IWS ) THEN + KK = K + ELSE + KK = 0 + END IF * -* Not enough workspace to use optimal NB: reduce NB and -* determine the minimum value of NB. +* Possibly bail to the unblocked code. * - NB = LWORK / LDWORK - NBMIN = MAX( 2, ILAENV( 2, 'ZUNGQL', ' ', M, N, K, - $ -1 ) ) - END IF - END IF + IF( KK.EQ.0 ) THEN + CALL ZUNG2L( M, N, K, A, LDA, TAU, WORK, IINFO ) END IF * - IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN + IF( KK.GT.0 ) THEN * -* Use blocked code after the first block. -* The last kk columns are handled by the block method. +* Factor the first block assuming that our first application +* will be on the Identity matrix * - KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB ) + I = 1 + IB = NB * -* Set A(m-kk+1:m,1:n-kk) to zero. +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) * - DO 20 J = 1, N - KK - DO 10 I = M - KK + 1, M - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE - ELSE - KK = 0 - END IF + CALL ZLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB, + $ A( 1, N-K+I ), LDA, TAU( I ), + $ A( M-K+I, N-K+I ), LDA) * -* Use unblocked code for the first or only block. +* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left +* Exploit the fact that we are applying to an identity * - CALL ZUNG2L( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO ) + CALL ZLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Backward', + $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, A(1, N-K+I), + $ LDA, A( M-K+I, N-K+I ), LDA, A, LDA) * - IF( KK.GT.0 ) THEN +* Apply H to rows 1:m-k+i+ib-1 of current block +* + CALL ZUNG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA, + $ TAU( I ), WORK, IINFO ) + +* Use blocked code on the remaining blocks if there are any. +* + DO I = NB+1, K, NB * -* Use blocked code +* The last block may be less than size NB * - DO 50 I = K - KK + 1, K, NB - IB = MIN( NB, K-I+1 ) - IF( N-K+I.GT.1 ) THEN + IB = MIN(NB, K-I+1) * -* Form the triangular factor of the block reflector -* H = H(i+ib-1) . . . H(i+1) H(i) +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) * - CALL ZLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB, - $ A( 1, N-K+I ), LDA, TAU( I ), WORK, LDWORK ) + CALL ZLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB, + $ A( 1, N-K+I ), LDA, TAU( I ), + $ A( M-K+I, N-K+I ), LDA ) * -* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left +* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left * - CALL ZLARFB( 'Left', 'No transpose', 'Backward', - $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, - $ A( 1, N-K+I ), LDA, WORK, LDWORK, A, LDA, - $ WORK( IB+1 ), LDWORK ) - END IF + CALL ZLARFB0C2(.FALSE., 'Left', 'No Transpose', + $ 'Backward', 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, + $ A(1, N-K+I), LDA, A( M-K+I, N-K+I ), LDA, A, LDA) * * Apply H to rows 1:m-k+i+ib-1 of current block * CALL ZUNG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA, $ TAU( I ), WORK, IINFO ) -* -* Set rows m-k+i+ib:m of current block to zero -* - DO 40 J = N - K + I, N - K + I + IB - 1 - DO 30 L = M - K + I + IB, M - A( L, J ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE + END DO END IF * WORK( 1 ) = IWS diff --git a/SRC/zungqr.f b/SRC/zungqr.f index 6c9f2e3ff5..bb5dd9ee03 100644 --- a/SRC/zungqr.f +++ b/SRC/zungqr.f @@ -122,7 +122,7 @@ *> \ingroup ungqr * * ===================================================================== - SUBROUTINE ZUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) + SUBROUTINE ZUNGQR(M, N, K, A, LDA, TAU, WORK, LWORK, INFO) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -137,17 +137,13 @@ SUBROUTINE ZUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * ===================================================================== * -* .. Parameters .. - COMPLEX*16 ZERO - PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) -* .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, J, KI, KK, L, LDWORK, - $ LWKOPT, NB, NBMIN, NX + INTEGER I, IB, IINFO, IWS, KI, KK, LWKOPT, NB, + $ NBMIN, NX * .. * .. External Subroutines .. - EXTERNAL XERBLA, ZLARFB, ZLARFT, ZUNG2R + EXTERNAL XERBLA, ZLARFB0C2, ZLARFT, ZUNG2R * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -162,7 +158,11 @@ SUBROUTINE ZUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * INFO = 0 NB = ILAENV( 1, 'ZUNGQR', ' ', M, N, K, -1 ) - LWKOPT = MAX( 1, N )*NB +* +* Only need a workspace for zung2r in case of bailout +* and for the panel factorization +* + LWKOPT = MAX( 1, N ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN @@ -191,92 +191,102 @@ SUBROUTINE ZUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) END IF * NBMIN = 2 - NX = 0 +* Determine when to cross over from unblocked to blocked + NX = MAX( 0, ILAENV( 3, 'ZUNGQR', ' ', M, N, K, -1 ) ) IWS = N - IF( NB.GT.1 .AND. NB.LT.K ) THEN * -* Determine when to cross over from blocked to unblocked code. -* - NX = MAX( 0, ILAENV( 3, 'ZUNGQR', ' ', M, N, K, -1 ) ) - IF( NX.LT.K ) THEN + IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Determine if workspace is large enough for blocked code. +* Treat the last NB block starting at KK+1 specially then use our blocking +* method from the block starting at KI+1 to 1 * - LDWORK = N - IWS = LDWORK*NB - IF( LWORK.LT.IWS ) THEN + KI = K - 2 * NB + KK = K - NB + ELSE + KK = 0 + END IF * -* Not enough workspace to use optimal NB: reduce NB and -* determine the minimum value of NB. +* Potentially bail to the unblocked code. * - NB = LWORK / LDWORK - NBMIN = MAX( 2, ILAENV( 2, 'ZUNGQR', ' ', M, N, K, - $ -1 ) ) - END IF - END IF + IF( KK.EQ.0 ) THEN + CALL ZUNG2R( M, N, K, A, LDA, TAU, WORK, IINFO ) END IF * - IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN + IF( KK.GT.0 ) THEN * -* Use blocked code after the last block. -* The first kk columns are handled by the block method. +* Factor the last block assuming that our first application +* will be on the Identity matrix * - KI = ( ( K-NX-1 ) / NB )*NB - KK = MIN( K, KI+NB ) + I = KK + 1 + IB = NB * -* Set A(1:kk,kk+1:n) to zero. +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * - DO 20 J = KK + 1, N - DO 10 I = 1, KK - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE - ELSE - KK = 0 - END IF + CALL ZLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + $ LDA, TAU(I), A(I,I), LDA) * -* Use unblocked code for the last or only block. +* Apply H to A(i:m,i+ib:n) from the left +* Exploit the fact that we are applying to an identity * - IF( KK.LT.N ) - $ CALL ZUNG2R( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA, - $ TAU( KK+1 ), WORK, IINFO ) + CALL ZLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Forward', + $ 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), LDA, A(I,I), + $ LDA, A(I,I+IB), LDA) * - IF( KK.GT.0 ) THEN +* Apply H to rows i:m of current block * -* Use blocked code + CALL ZUNG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, IINFO) * - DO 50 I = KI + 1, 1, -NB - IB = MIN( NB, K-I+1 ) - IF( I+IB.LE.N ) THEN +* Use our standard blocking method after the last block * -* Form the triangular factor of the block reflector -* H = H(i) H(i+1) . . . H(i+ib-1) + DO I = KI + 1, 1, -NB + IB = NB * - CALL ZLARFT( 'Forward', 'Columnwise', M-I+1, IB, - $ A( I, I ), LDA, TAU( I ), WORK, LDWORK ) +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * -* Apply H to A(i:m,i+ib:n) from the left + CALL ZLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + $ LDA, TAU(I), A(I,I), LDA) * - CALL ZLARFB( 'Left', 'No transpose', 'Forward', - $ 'Columnwise', M-I+1, N-I-IB+1, IB, - $ A( I, I ), LDA, WORK, LDWORK, A( I, I+IB ), - $ LDA, WORK( IB+1 ), LDWORK ) - END IF +* Apply H to A(i:m,i+ib:n) from the left +* + CALL ZLARFB0C2(.FALSE., 'Left', 'No Transpose', + $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), + $ LDA, A(I,I), LDA, A(I,I+IB), LDA) + * * Apply H to rows i:m of current block * - CALL ZUNG2R( M-I+1, IB, IB, A( I, I ), LDA, TAU( I ), - $ WORK, - $ IINFO ) + CALL ZUNG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, + $ IINFO) + END DO +* +* This checks for if K was a perfect multiple of NB +* so that we only have a special case for the last block when +* necessary +* + IF(I.LT.1) THEN + IB = I + NB - 1 + I = 1 * -* Set rows 1:i-1 of current block to zero +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * - DO 40 J = I, I + IB - 1 - DO 30 L = 1, I - 1 - A( L, J ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE + CALL ZLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + $ LDA, TAU(I), A(I,I), LDA) +* +* Apply H to A(i:m,i+ib:n) from the left +* + CALL ZLARFB0C2(.FALSE., 'Left', 'No Transpose', + $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), + $ LDA, A(I,I), LDA, A(I,I+IB), LDA) + +* +* Apply H to rows i:m of current block +* + CALL ZUNG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, + $ IINFO) + END IF END IF * WORK( 1 ) = IWS diff --git a/SRC/zungrq.f b/SRC/zungrq.f index dcc2772d67..1a7640d102 100644 --- a/SRC/zungrq.f +++ b/SRC/zungrq.f @@ -122,7 +122,7 @@ *> \ingroup ungrq * * ===================================================================== - SUBROUTINE ZUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) + SUBROUTINE ZUNGRQ(M, N, K, A, LDA, TAU, WORK, LWORK, INFO) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -137,13 +137,9 @@ SUBROUTINE ZUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * ===================================================================== * -* .. Parameters .. - COMPLEX*16 ZERO - PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) -* .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, II, IINFO, IWS, J, KK, L, LDWORK, + INTEGER I, IB, II, IINFO, IWS, KK, LDWORK, $ LWKOPT, NB, NBMIN, NX * .. * .. External Subroutines .. @@ -230,61 +226,67 @@ SUBROUTINE ZUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * Use blocked code after the first block. * The last kk rows are handled by the block method. * - KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB ) -* -* Set A(1:m-kk,n-kk+1:n) to zero. -* - DO 20 J = N - KK + 1, N - DO 10 I = 1, M - KK - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE + KK = K ELSE KK = 0 END IF * -* Use unblocked code for the first or only block. +* Potentially bail to the unblocked code * - CALL ZUNGR2( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO ) + IF( KK.EQ.0 ) THEN + CALL ZUNGR2( M, N, K, A, LDA, TAU, WORK, IINFO ) + END IF * IF( KK.GT.0 ) THEN * -* Use blocked code +* Factor the first block assuming that our first application +* will be on the Identity matrix * - DO 50 I = K - KK + 1, K, NB - IB = MIN( NB, K-I+1 ) - II = M - K + I - IF( II.GT.1 ) THEN + I = 1 + IB = NB + II = M - K + I * -* Form the triangular factor of the block reflector -* H = H(i+ib-1) . . . H(i+1) H(i) +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) * - CALL ZLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB, - $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK ) + CALL ZLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB, + $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK ) * -* Apply H**H to A(1:m-k+i-1,1:n-k+i+ib-1) from the right +* Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right * - CALL ZLARFB( 'Right', 'Conjugate transpose', - $ 'Backward', - $ 'Rowwise', II-1, N-K+I+IB-1, IB, A( II, 1 ), - $ LDA, WORK, LDWORK, A, LDA, WORK( IB+1 ), - $ LDWORK ) - END IF + CALL ZLARFB0C2(.TRUE., 'Right', 'Conjugate', 'Backward', + $ 'Rowwise', II-1, N-K+I+IB-1, IB, A(II,1), LDA, WORK, + $ LDWORK, A, LDA) * -* Apply H**H to columns 1:n-k+i+ib-1 of current block +* Apply H**T to columns 1:n-k+i+ib-1 of current block * - CALL ZUNGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA, - $ TAU( I ), - $ WORK, IINFO ) + CALL ZUNGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA, + $ TAU( I ), WORK, IINFO ) + + DO I = NB+1, K, NB * -* Set columns n-k+i+ib:n of current block to zero +* The last block may be less than size NB * - DO 40 L = N - K + I + IB, N - DO 30 J = II, II + IB - 1 - A( J, L ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE + IB = MIN(NB, K-I+1) + II = M - K + I +* +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) +* + CALL ZLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB, + $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK ) +* +* Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right +* + CALL ZLARFB0C2(.FALSE., 'Right', 'Conjugate', + $ 'Backward', 'Rowwise', II-1, N-K+I+IB-1, IB, + $ A(II,1), LDA, WORK, LDWORK, A, LDA) +* +* Apply H**T to columns 1:n-k+i+ib-1 of current block +* + CALL ZUNGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA, + $ TAU( I ), WORK, IINFO ) + END DO END IF * WORK( 1 ) = IWS From 2b21ac7bceadaeb7f6022d0c00d38aedd74068d6 Mon Sep 17 00:00:00 2001 From: Johnathan Rhyne Date: Tue, 11 Feb 2025 17:05:34 -0700 Subject: [PATCH 04/10] single complex implementation also pass local tests. TODO: check github tests --- SRC/CMakeLists.txt | 2 +- SRC/Makefile | 2 +- SRC/clarfb0c2.f | 381 +++++++++++++++++++++++++++++++++++++++++++++ SRC/cunglq.f | 122 +++++++++------ SRC/cungql.f | 117 +++++++------- SRC/cungqr.f | 146 +++++++++-------- SRC/cungrq.f | 90 +++++------ SRC/slarfb0c2.f | 16 +- SRC/zlarfb0c2.f | 16 +- 9 files changed, 649 insertions(+), 243 deletions(-) create mode 100644 SRC/clarfb0c2.f diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt index 16156b72a2..24740835b8 100644 --- a/SRC/CMakeLists.txt +++ b/SRC/CMakeLists.txt @@ -218,7 +218,7 @@ set(CLASRC claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqp2rk.f claqp3rk.f claqsb.f claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f - clarf.f clarf1f.f clarf1l.f clarfb.f clarfb_gett.f clarfg.f clarfgp.f clarft.f + clarf.f clarf1f.f clarf1l.f clarfb.f clarfb0c2.f clarfb_gett.f clarfg.f clarfgp.f clarft.f clarfx.f clarfy.f clargv.f clarnv.f clarrv.f clartg.f90 clartv.f clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f90 claswp.f clasyf.f clasyf_rook.f clasyf_rk.f clasyf_aa.f diff --git a/SRC/Makefile b/SRC/Makefile index 44ba06be9d..b9eaf1bca3 100644 --- a/SRC/Makefile +++ b/SRC/Makefile @@ -249,7 +249,7 @@ CLASRC = \ claqhb.o claqhe.o claqhp.o claqp2.o claqps.o claqp2rk.o claqp3rk.o claqsb.o \ claqr0.o claqr1.o claqr2.o claqr3.o claqr4.o claqr5.o \ claqsp.o claqsy.o clar1v.o clar2v.o ilaclr.o ilaclc.o \ - clarf.o clarf1f.o clarf1l.o clarfb.o clarfb_gett.o clarfg.o clarft.o clarfgp.o \ + clarf.o clarf1f.o clarf1l.o clarfb.o clarfb0c2.o clarfb_gett.o clarfg.o clarft.o clarfgp.o \ clarfx.o clarfy.o clargv.o clarnv.o clarrv.o clartg.o clartv.o \ clarz.o clarzb.o clarzt.o clascl.o claset.o clasr.o classq.o \ claswp.o clasyf.o clasyf_rook.o clasyf_rk.o clasyf_aa.o \ diff --git a/SRC/clarfb0c2.f b/SRC/clarfb0c2.f new file mode 100644 index 0000000000..4ab8f47aea --- /dev/null +++ b/SRC/clarfb0c2.f @@ -0,0 +1,381 @@ + SUBROUTINE CLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, + $ K, V, LDV, T, LDT, C, LDC) + ! Scalar arguments + INTEGER M, N, K, LDV, LDC, LDT + CHARACTER SIDE, TRANS, DIRECT, STOREV + ! True means that we are assuming C2 is the identity matrix + ! and thus don't reference whatever is present in C2 + ! at the beginning. + LOGICAL C2I + + ! Array arguments + COMPLEX*8 V(LDV,*), C(LDC,*), T(LDT,*) + ! Local scalars + LOGICAL QR, LQ, QL, DIRF, COLV, SIDEL, SIDER, + $ TRANST + INTEGER I, J + ! Intrinsic Functions + INTRINSIC CONJG + ! External functions + LOGICAL LSAME + EXTERNAL LSAME + ! External Subroutines + EXTERNAL CGEMM, CTRMM + ! Parameters + COMPLEX*8 ONE, ZERO, NEG_ONE + PARAMETER(ONE=(1.0E+0, 0.0E+0), + $ ZERO = (0.0E+0, 0.0E+0), + $ NEG_ONE = (-1.0E+0, 0.0E+0)) + + ! Beginning of executable statements + ! Convert our character flags to logical values + DIRF = LSAME(DIRECT,'F') + COLV = LSAME(STOREV,'C') + SIDEL = LSAME(SIDE,'L') + SIDER = LSAME(SIDE,'R') + TRANST = LSAME(TRANS,'C') + + ! Determine which of the 4 modes are using. + ! QR is when we store the reflectors column by column and have the + ! 'first' reflector stored in the first column + QR = DIRF.AND.COLV + + ! LQ is when we store the reflectors row by row and have the + ! 'first' reflector stored in the first row + LQ = DIRF.AND.(.NOT.COLV) + + ! QL is when we store the reflectors column by column and have the + ! 'first' reflector stored in the last column + QL = (.NOT.DIRF).AND.COLV + + ! RQ is when we store the reflectors row by row and have the + ! 'first' reflector stored in the last row + ! RQ = (.NOT.DIRF).AND.(.NOT.COLV) + ! Since we have exactly one of these 4 modes, we don't need to actually + ! store the value of RQ, instead we assume this is the case if we fail + ! the above 3 checks. + + IF (QR) THEN + ! We are computing C = HC = (I - VTV')C + ! Where: V = [ V1 ] and C = [ C1 ] + ! [ V2 ] [ C2 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{M-K\times K} + ! C1=0\in\R^{K\times N} + ! C2\in\R^{M-K\times N} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = HC = (I - [ V1 ]T [V1', V2'])[ 0 ] + ! [ V2 ] [ C2 ] + ! = [ 0 ] - [ V1 ]T*V2'*C2 + ! [ C2 ] [ V2 ] + ! + ! = [ 0 ] - [ V1*T*V2'*C2 ] + ! [ C2 ] [ V2*T*V2'*C2 ] + ! + ! = [ V1*T*V2'*C2 ] + ! [ C2 - V2*T*V2'*C2 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = V2'*C2 + ! C1 = T*C1 + ! C2 = C2 - V2*C1 + ! C1 = -V1*C1 + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDEL ) THEN + CALL XERBLA('CLARFB0C2', 2) + RETURN + ELSE IF(TRANST) THEN + CALL XERBLA('CLARFB0C2', 3) + RETURN + END IF + ! + ! C1 = V2'*C2 + ! + IF (C2I) THEN + DO J = 1, N + DO I = 1, K + C(I,J) = CONJG(V(K+J,I)) + END DO + END DO + ELSE + CALL CGEMM('Conjugate', 'No Transpose', K, N, M - K, + $ ONE, V(K+1,1), LDV, C(K+1,1), LDC, ZERO, + $ C, LDC) + END IF + ! + ! C1 = T*C1 + ! + CALL CTRMM('Left', 'Upper', 'No Transpose', 'Non-unit', + $ K, N, ONE, T, LDT, C, LDC) + ! + ! C2 = C2 - V2*C1 = -V2*C1 + C2 + ! + IF (C2I) THEN + CALL CGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V(K+1,1), LDV, C, LDC, ZERO, + $ C(K+1,1), LDC) + DO I = 1, N + C(K+I,I) = C(K+I,I) + ONE + END DO + ELSE + CALL CGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V(K+1,1), LDV, C, LDC, ONE, + $ C(K+1,1), LDC) + END IF + ! + ! C1 = -V1*C1 + ! + CALL CTRMM('Left', 'Lower', 'No Transpose', 'Unit', + $ K, N, NEG_ONE, V, LDV, C, LDC) + ELSE IF (LQ) THEN + ! We are computing C = CH' = C(I-V'T'V) + ! Where: V = [ V1 V2 ] and C = [ C1 C2 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{K\times N-K} + ! C1=0\in\R^{M\times K} + ! C2\in\R^{M\times N-K} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = CH' = [ 0, C2 ](I - [ V1' ]T'[ V1, V2 ]) + ! [ V2' ] + ! + ! = [ 0, C2 ] - [ 0, C2 ][ V1' ]T'[ V1, V2 ] + ! [ V2' ] + ! + ! = [ 0, C2 ] - C2*V2'*T'[ V1, V2 ] + ! + ! = [ -C2*V2'*T'*V1, C2 - C2*V2'*T'*V2 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = C2*V2' + ! C1 = C1*T' + ! C2 = C2 - C1*V2 + ! C1 = -C1*V1 + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDER ) THEN + CALL XERBLA('CLARFB0C2', 2) + RETURN + ELSE IF(.NOT.TRANST) THEN + CALL XERBLA('CLARFB0C2', 3) + RETURN + END IF + ! + ! C1 = C2*V2' + ! + IF( C2I ) THEN + DO J = 1, K + DO I = 1, M + C(I,J) = CONJG(V(J,K+I)) + END DO + END DO + ELSE + CALL CGEMM('No Transpose', 'Conjugate', M, K, N-K, + $ ONE, C(1,K+1), LDC, V(1, K+1), LDV, ZERO, C, + $ LDC) + END IF + ! + ! C1 = C1*T' + ! + CALL CTRMM('Right', 'Upper', 'Conjugate', 'Non-unit', + $ M, K, ONE, T, LDT, C, LDC) + ! + ! C2 = C2 - C1*V2 = -C1*V2 + C2 + ! + IF( C2I ) THEN + CALL CGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C, LDC, V(1,K+1), LDV, ZERO, C(1,K+1), + $ LDC) + DO I = 1, M + C(I,K+I) = C(I,K+I) + ONE + END DO + ELSE + CALL CGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C, LDC, V(1,K+1), LDV, ONE, C(1,K+1), + $ LDC) + END IF + ! + ! C1 = -C1*V1 + ! + CALL CTRMM('Right', 'Upper', 'No Transpose', 'Unit', + $ M, K, NEG_ONE, V, LDV, C, LDC) + ELSE IF (QL) THEN + ! We are computing C = HC = (I - VTV')C + ! Where: V = [ V2 ] and C = [ C2 ] + ! [ V1 ] [ C1 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{M-K\times K} + ! C1=0\in\R^{K\times N} + ! C2\in\R^{M-K\times N} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = HC = (I-[ V2 ]T[ V2' V1' ])[ C2 ] + ! [ V1 ] [ 0 ] + ! + ! = [ C2 ] - [ V2 ]T*V2'*C2 + ! [ 0 ] [ V1 ] + ! + ! = [ C2 ] - [ V2*T*V2'*C2 ] + ! [ 0 ] [ V1*T*V2'*C2 ] + ! + ! = [ C2 - V2*T*V2'*C2 ] + ! [ - V1*T*V2'*C2 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = V2'*C2 + ! C1 = T*C1 + ! C2 = C2 - V2*C1 + ! C1 = -V1*C1 + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDEL ) THEN + CALL XERBLA('CLARFB0C2', 2) + RETURN + ELSE IF(TRANST) THEN + CALL XERBLA('CLARFB0C2', 3) + RETURN + END IF + ! + ! C1 = V2'*C2 + ! + IF( C2I ) THEN + DO J = 1, N + DO I = 1, K + C(M-K+I,J) = CONJG(V(J,I)) + END DO + END DO + ELSE + CALL CGEMM('Conjugate', 'No Transpose', K, N, M-K, + $ ONE, V, LDV, C, LDC, ZERO, C(M-K+1, 1), LDC) + END IF + ! + ! C1 = T*C1 + ! + CALL CTRMM('Left', 'Lower', 'No Transpose', 'Non-unit', + $ K, N, ONE, T, LDT, C(M-K+1,1), LDC) + ! + ! C2 = C2 - V2*C1 = -V2*C1 + C2 + ! + IF( C2I ) THEN + CALL CGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ZERO, C, LDC) + DO I = 1, N + C(I,I) = C(I,I) + ONE + END DO + ELSE + CALL CGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ONE, C, LDC) + END IF + ! + ! C1 = -V1*C1 + ! + CALL CTRMM('Left', 'Upper', 'No Transpose', 'Unit', + $ K, N, NEG_ONE, V(M-K+1,1), LDV, C(M-K+1,1), LDC) + ELSE ! IF (RQ) THEN + ! We are computing C = CH' = C(I-V'T'V) + ! Where: V = [ V2 V1] and C = [ C2 C1 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{K\times N-K} + ! C1=0\in\R^{M\times K} + ! C2\in\R^{M\times N-K} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = CH' = [ C2, 0 ] (I - [ V2' ]T'[ V2, V1 ] + ! [ V1' ] + ! + ! = [ C2, 0 ] - [ C2, 0 ] [ V2' ]T'[ V2, V1 ] + ! [ V1' ] + ! + ! = [ C2, 0 ] - C2*V2'*T'[ V2, V1 ] + ! + ! = [ C2, 0 ] - [ C2*V2'*T'*V2, C2*V2'*T'*V1 ] + ! + ! = [ C2 - C2*V2'*T'*V2, -C2*V2'*T'*V1 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = C2*V2' + ! C1 = C1*T' + ! C2 = C2 - C1*V2 + ! C1 = -C1*V1 + ! + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDER ) THEN + CALL XERBLA('CLARFB0C2', 2) + RETURN + ELSE IF(.NOT.TRANST) THEN + CALL XERBLA('CLARFB0C2', 3) + RETURN + END IF + ! + ! C1 = C2*V2' + ! + IF( C2I ) THEN + DO J = 1, K + DO I = 1, M + C(I,N-K+J) = CONJG(V(J,I)) + END DO + END DO + ELSE + CALL CGEMM('No Transpose', 'Conjugate', M, K, N-K, + $ ONE, C, LDC, V, LDV, ZERO, C(1, N-K+1), LDC) + END IF + ! + ! C1 = C1*T' + ! + CALL CTRMM('Right', 'Lower', 'Conjugate', 'Non-unit', + $ M, K, ONE, T, LDT, C(1, N-K+1), LDC) + ! + ! C2 = C2 - C1*V2 = -C1*V2 + C2 + ! + IF( C2I ) THEN + CALL CGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ZERO, C, LDC) + DO I = 1, M + C(I,I) = C(I,I) + ONE + END DO + ELSE + CALL CGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ONE, C, LDC) + END IF + ! + ! C1 = -C1*V1 + ! + CALL CTRMM('Right', 'Lower', 'No Transpose', 'Unit', + $ M, K, NEG_ONE, V(1, N-K+1), LDV, C(1,N-K+1), LDC) + END IF + END SUBROUTINE diff --git a/SRC/cunglq.f b/SRC/cunglq.f index 7f42c3dcb0..a57f5a037d 100644 --- a/SRC/cunglq.f +++ b/SRC/cunglq.f @@ -136,17 +136,13 @@ SUBROUTINE CUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * ===================================================================== * -* .. Parameters .. - COMPLEX ZERO - PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) -* .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, J, KI, KK, L, LDWORK, - $ LWKOPT, NB, NBMIN, NX + INTEGER I, IB, IINFO, IWS, KI, KK, LWKOPT, LDWORK, + $ NB, NBMIN, NX * .. * .. External Subroutines .. - EXTERNAL CLARFB, CLARFT, CUNGL2, XERBLA + EXTERNAL CLARFB0C2, CLARFT, CUNGL2, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -218,67 +214,95 @@ SUBROUTINE CUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Use blocked code after the last block. -* The first kk rows are handled by the block method. -* - KI = ( ( K-NX-1 ) / NB )*NB - KK = MIN( K, KI+NB ) -* -* Set A(kk+1:m,1:kk) to zero. +* Treat the last NB block starting at KK+1 specially then use our blocking +* method from the block starting at KI+1 to 1 * - DO 20 J = 1, KK - DO 10 I = KK + 1, M - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE + KI = K - 2 * NB + KK = K - NB ELSE KK = 0 END IF * -* Use unblocked code for the last or only block. +* Potentially bail to the unblocked version * - IF( KK.LT.M ) - $ CALL CUNGL2( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA, - $ TAU( KK+1 ), WORK, IINFO ) + IF( KK.EQ.0 ) THEN + CALL CUNGL2( M, N, K, A, LDA, TAU, WORK, IINFO ) + END IF * IF( KK.GT.0 ) THEN * -* Use blocked code +* Factor the last block assuming that our first application +* will be on the Identity matrix * - DO 50 I = KI + 1, 1, -NB - IB = MIN( NB, K-I+1 ) - IF( I+IB.LE.M ) THEN + I = KK + 1 + IB = NB * -* Form the triangular factor of the block reflector -* H = H(i) H(i+1) . . . H(i+ib-1) +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * - CALL CLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, - $ I ), - $ LDA, TAU( I ), WORK, LDWORK ) + CALL CLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ), + $ LDA, TAU( I ), WORK, LDWORK ) * -* Apply H**H to A(i+ib:m,i:n) from the right +* Apply H**T to A(i+ib:m,i:n) from the right +* Exploit the fact that we are applying to an identity * - CALL CLARFB( 'Right', 'Conjugate transpose', - $ 'Forward', - $ 'Rowwise', M-I-IB+1, N-I+1, IB, A( I, I ), - $ LDA, WORK, LDWORK, A( I+IB, I ), LDA, - $ WORK( IB+1 ), LDWORK ) - END IF + CALL CLARFB0C2(.TRUE., 'Right', 'Conjugate', 'Forward', + $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, + $ LDWORK, A(I+IB,I), LDA) +* +* Apply H**T to columns i:n of current block + + CALL CUNGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ), + $ WORK, IINFO ) +* +* Use our standard blocking method after the last block +* + DO I = KI + 1, 1, -NB + IB = NB +* +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * -* Apply H**H to columns i:n of current block + CALL CLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ), + $ LDA, TAU( I ), WORK, LDWORK ) * +* Apply H**T to A(i+ib:m,i:n) from the right +* + CALL CLARFB0C2(.FALSE., 'Right', 'Conjugate', 'Forward', + $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, + $ LDWORK, A(I+IB,I), LDA) +* +* Apply H**T to columns i:n of current block + CALL CUNGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ), - $ WORK, - $ IINFO ) + $ WORK, IINFO ) + END DO +* +* This checks for if K was a perfect multiple of NB +* so that we only have a special case for the last block when +* necessary +* + IF(I.LT.1) THEN + IB = I + NB - 1 + I = 1 +* +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * -* Set columns 1:i-1 of current block to zero + CALL CLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, I ), + $ LDA, TAU( I ), WORK, LDWORK ) * - DO 40 J = 1, I - 1 - DO 30 L = I, I + IB - 1 - A( L, J ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE +* Apply H**T to A(i+ib:m,i:n) from the right +* + CALL CLARFB0C2(.FALSE., 'Right', 'Conjugate', 'Forward', + $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, WORK, + $ LDWORK, A(I+IB,I), LDA) +* +* Apply H**T to columns i:n of current block + + CALL CUNGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ), + $ WORK, IINFO ) + END IF END IF * WORK( 1 ) = SROUNDUP_LWORK(IWS) diff --git a/SRC/cungql.f b/SRC/cungql.f index 3da2702d8e..893303b4e1 100644 --- a/SRC/cungql.f +++ b/SRC/cungql.f @@ -137,17 +137,13 @@ SUBROUTINE CUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * ===================================================================== * -* .. Parameters .. - COMPLEX ZERO - PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) -* .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, J, KK, L, LDWORK, LWKOPT, - $ NB, NBMIN, NX + INTEGER I, IB, IINFO, IWS, KK, LWKOPT, NB, NBMIN, + $ NX * .. * .. External Subroutines .. - EXTERNAL CLARFB, CLARFT, CUNG2L, XERBLA + EXTERNAL CLARFB0C2, CLARFT, CUNG2L, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -178,7 +174,11 @@ SUBROUTINE CUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) LWKOPT = 1 ELSE NB = ILAENV( 1, 'CUNGQL', ' ', M, N, K, -1 ) - LWKOPT = N*NB +* +* Only need a workspace for cung2l in case of bailout +* and for the panel factorization +* + LWKOPT = N END IF WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * @@ -201,88 +201,77 @@ SUBROUTINE CUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) END IF * NBMIN = 2 - NX = 0 + NX = MAX( 0, ILAENV( 3, 'CUNGQL', ' ', M, N, K, -1 )) IWS = N - IF( NB.GT.1 .AND. NB.LT.K ) THEN * -* Determine when to cross over from blocked to unblocked code. -* - NX = MAX( 0, ILAENV( 3, 'CUNGQL', ' ', M, N, K, -1 ) ) - IF( NX.LT.K ) THEN + IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Determine if workspace is large enough for blocked code. +* We use blocked code for the entire construction * - LDWORK = N - IWS = LDWORK*NB - IF( LWORK.LT.IWS ) THEN + KK = K + ELSE + KK = 0 + END IF * -* Not enough workspace to use optimal NB: reduce NB and -* determine the minimum value of NB. +* Possibly bail to the unblocked code. * - NB = LWORK / LDWORK - NBMIN = MAX( 2, ILAENV( 2, 'CUNGQL', ' ', M, N, K, - $ -1 ) ) - END IF - END IF + IF( KK.EQ.0 ) THEN + CALL CUNG2L( M, N, K, A, LDA, TAU, WORK, IINFO ) END IF * - IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN + IF( KK.GT.0 ) THEN * -* Use blocked code after the first block. -* The last kk columns are handled by the block method. +* Factor the first block assuming that our first application +* will be on the Identity matrix * - KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB ) + I = 1 + IB = NB * -* Set A(m-kk+1:m,1:n-kk) to zero. +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) * - DO 20 J = 1, N - KK - DO 10 I = M - KK + 1, M - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE - ELSE - KK = 0 - END IF + CALL CLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB, + $ A( 1, N-K+I ), LDA, TAU( I ), + $ A( M-K+I, N-K+I ), LDA) * -* Use unblocked code for the first or only block. +* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left +* Exploit the fact that we are applying to an identity * - CALL CUNG2L( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO ) + CALL CLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Backward', + $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, A(1, N-K+I), + $ LDA, A( M-K+I, N-K+I ), LDA, A, LDA) * - IF( KK.GT.0 ) THEN +* Apply H to rows 1:m-k+i+ib-1 of current block * -* Use blocked code + CALL CUNG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA, + $ TAU( I ), WORK, IINFO ) + +* Use blocked code on the remaining blocks if there are any. * - DO 50 I = K - KK + 1, K, NB - IB = MIN( NB, K-I+1 ) - IF( N-K+I.GT.1 ) THEN + DO I = NB+1, K, NB * -* Form the triangular factor of the block reflector -* H = H(i+ib-1) . . . H(i+1) H(i) +* The last block may be less than size NB * - CALL CLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB, - $ A( 1, N-K+I ), LDA, TAU( I ), WORK, LDWORK ) + IB = MIN(NB, K-I+1) * -* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) * - CALL CLARFB( 'Left', 'No transpose', 'Backward', - $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, - $ A( 1, N-K+I ), LDA, WORK, LDWORK, A, LDA, - $ WORK( IB+1 ), LDWORK ) - END IF + CALL CLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB, + $ A( 1, N-K+I ), LDA, TAU( I ), + $ A( M-K+I, N-K+I ), LDA ) +* +* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left +* + CALL CLARFB0C2(.FALSE., 'Left', 'No Transpose', + $ 'Backward', 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, + $ A(1, N-K+I), LDA, A( M-K+I, N-K+I ), LDA, A, LDA) * * Apply H to rows 1:m-k+i+ib-1 of current block * CALL CUNG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA, $ TAU( I ), WORK, IINFO ) -* -* Set rows m-k+i+ib:m of current block to zero -* - DO 40 J = N - K + I, N - K + I + IB - 1 - DO 30 L = M - K + I + IB, M - A( L, J ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE + END DO END IF * WORK( 1 ) = CMPLX( IWS ) diff --git a/SRC/cungqr.f b/SRC/cungqr.f index eb49d2fed5..2544b67a99 100644 --- a/SRC/cungqr.f +++ b/SRC/cungqr.f @@ -137,17 +137,13 @@ SUBROUTINE CUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * ===================================================================== * -* .. Parameters .. - COMPLEX ZERO - PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) -* .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, J, KI, KK, L, LDWORK, - $ LWKOPT, NB, NBMIN, NX + INTEGER I, IB, IINFO, IWS, KI, KK, LWKOPT, NB, + $ NBMIN, NX * .. * .. External Subroutines .. - EXTERNAL CLARFB, CLARFT, CUNG2R, XERBLA + EXTERNAL CLARFB0C2, CLARFT, CUNG2R, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -163,7 +159,11 @@ SUBROUTINE CUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * INFO = 0 NB = ILAENV( 1, 'CUNGQR', ' ', M, N, K, -1 ) - LWKOPT = MAX( 1, N )*NB +* +* Only need a workspace for zung2r in case of bailout +* and for the panel factorization +* + LWKOPT = MAX( 1, N ) WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN @@ -192,92 +192,102 @@ SUBROUTINE CUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) END IF * NBMIN = 2 - NX = 0 +* Determine when to cross over from unblocked to blocked + NX = MAX( 0, ILAENV( 3, 'CUNGQR', ' ', M, N, K, -1 ) ) IWS = N - IF( NB.GT.1 .AND. NB.LT.K ) THEN * -* Determine when to cross over from blocked to unblocked code. -* - NX = MAX( 0, ILAENV( 3, 'CUNGQR', ' ', M, N, K, -1 ) ) - IF( NX.LT.K ) THEN + IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Determine if workspace is large enough for blocked code. +* Treat the last NB block starting at KK+1 specially then use our blocking +* method from the block starting at KI+1 to 1 * - LDWORK = N - IWS = LDWORK*NB - IF( LWORK.LT.IWS ) THEN + KI = K - 2 * NB + KK = K - NB + ELSE + KK = 0 + END IF * -* Not enough workspace to use optimal NB: reduce NB and -* determine the minimum value of NB. +* Potentially bail to the unblocked code. * - NB = LWORK / LDWORK - NBMIN = MAX( 2, ILAENV( 2, 'CUNGQR', ' ', M, N, K, - $ -1 ) ) - END IF - END IF + IF( KK.EQ.0 ) THEN + CALL CUNG2R( M, N, K, A, LDA, TAU, WORK, IINFO ) END IF * - IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN + IF( KK.GT.0 ) THEN * -* Use blocked code after the last block. -* The first kk columns are handled by the block method. +* Factor the last block assuming that our first application +* will be on the Identity matrix * - KI = ( ( K-NX-1 ) / NB )*NB - KK = MIN( K, KI+NB ) + I = KK + 1 + IB = NB * -* Set A(1:kk,kk+1:n) to zero. +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * - DO 20 J = KK + 1, N - DO 10 I = 1, KK - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE - ELSE - KK = 0 - END IF + CALL CLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + $ LDA, TAU(I), A(I,I), LDA) * -* Use unblocked code for the last or only block. +* Apply H to A(i:m,i+ib:n) from the left +* Exploit the fact that we are applying to an identity * - IF( KK.LT.N ) - $ CALL CUNG2R( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA, - $ TAU( KK+1 ), WORK, IINFO ) + CALL CLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Forward', + $ 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), LDA, A(I,I), + $ LDA, A(I,I+IB), LDA) * - IF( KK.GT.0 ) THEN +* Apply H to rows i:m of current block * -* Use blocked code + CALL CUNG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, IINFO) * - DO 50 I = KI + 1, 1, -NB - IB = MIN( NB, K-I+1 ) - IF( I+IB.LE.N ) THEN +* Use our standard blocking method after the last block * -* Form the triangular factor of the block reflector -* H = H(i) H(i+1) . . . H(i+ib-1) + DO I = KI + 1, 1, -NB + IB = NB * - CALL CLARFT( 'Forward', 'Columnwise', M-I+1, IB, - $ A( I, I ), LDA, TAU( I ), WORK, LDWORK ) +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * -* Apply H to A(i:m,i+ib:n) from the left + CALL CLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + $ LDA, TAU(I), A(I,I), LDA) * - CALL CLARFB( 'Left', 'No transpose', 'Forward', - $ 'Columnwise', M-I+1, N-I-IB+1, IB, - $ A( I, I ), LDA, WORK, LDWORK, A( I, I+IB ), - $ LDA, WORK( IB+1 ), LDWORK ) - END IF +* Apply H to A(i:m,i+ib:n) from the left +* + CALL CLARFB0C2(.FALSE., 'Left', 'No Transpose', + $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), + $ LDA, A(I,I), LDA, A(I,I+IB), LDA) + * * Apply H to rows i:m of current block * - CALL CUNG2R( M-I+1, IB, IB, A( I, I ), LDA, TAU( I ), - $ WORK, - $ IINFO ) + CALL CUNG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, + $ IINFO) + END DO +* +* This checks for if K was a perfect multiple of NB +* so that we only have a special case for the last block when +* necessary +* + IF(I.LT.1) THEN + IB = I + NB - 1 + I = 1 * -* Set rows 1:i-1 of current block to zero +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * - DO 40 J = I, I + IB - 1 - DO 30 L = 1, I - 1 - A( L, J ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE + CALL CLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + $ LDA, TAU(I), A(I,I), LDA) +* +* Apply H to A(i:m,i+ib:n) from the left +* + CALL CLARFB0C2(.FALSE., 'Left', 'No Transpose', + $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), + $ LDA, A(I,I), LDA, A(I,I+IB), LDA) + +* +* Apply H to rows i:m of current block +* + CALL CUNG2R(M-I+1, IB, IB, A(I,I), LDA, TAU(I), WORK, + $ IINFO) + END IF END IF * WORK( 1 ) = SROUNDUP_LWORK(IWS) diff --git a/SRC/cungrq.f b/SRC/cungrq.f index 7e812b839f..4a9b2fe512 100644 --- a/SRC/cungrq.f +++ b/SRC/cungrq.f @@ -137,17 +137,13 @@ SUBROUTINE CUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * ===================================================================== * -* .. Parameters .. - COMPLEX ZERO - PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) -* .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, II, IINFO, IWS, J, KK, L, LDWORK, + INTEGER I, IB, II, IINFO, IWS, KK, LDWORK, $ LWKOPT, NB, NBMIN, NX * .. * .. External Subroutines .. - EXTERNAL CLARFB, CLARFT, CUNGR2, XERBLA + EXTERNAL CLARFB0C2, CLARFT, CUNGR2, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -231,61 +227,67 @@ SUBROUTINE CUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * Use blocked code after the first block. * The last kk rows are handled by the block method. * - KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB ) -* -* Set A(1:m-kk,n-kk+1:n) to zero. -* - DO 20 J = N - KK + 1, N - DO 10 I = 1, M - KK - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE + KK = K ELSE KK = 0 END IF * -* Use unblocked code for the first or only block. +* Potentially bail to the unblocked code * - CALL CUNGR2( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO ) + IF( KK.EQ.0 ) THEN + CALL CUNGR2( M, N, K, A, LDA, TAU, WORK, IINFO ) + END IF * IF( KK.GT.0 ) THEN * -* Use blocked code +* Factor the first block assuming that our first application +* will be on the Identity matrix * - DO 50 I = K - KK + 1, K, NB - IB = MIN( NB, K-I+1 ) - II = M - K + I - IF( II.GT.1 ) THEN + I = 1 + IB = NB + II = M - K + I * -* Form the triangular factor of the block reflector -* H = H(i+ib-1) . . . H(i+1) H(i) +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) * - CALL CLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB, - $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK ) + CALL CLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB, + $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK ) * -* Apply H**H to A(1:m-k+i-1,1:n-k+i+ib-1) from the right +* Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right * - CALL CLARFB( 'Right', 'Conjugate transpose', - $ 'Backward', - $ 'Rowwise', II-1, N-K+I+IB-1, IB, A( II, 1 ), - $ LDA, WORK, LDWORK, A, LDA, WORK( IB+1 ), - $ LDWORK ) - END IF + CALL CLARFB0C2(.TRUE., 'Right', 'Conjugate', 'Backward', + $ 'Rowwise', II-1, N-K+I+IB-1, IB, A(II,1), LDA, WORK, + $ LDWORK, A, LDA) * -* Apply H**H to columns 1:n-k+i+ib-1 of current block +* Apply H**T to columns 1:n-k+i+ib-1 of current block * - CALL CUNGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA, - $ TAU( I ), - $ WORK, IINFO ) + CALL CUNGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA, + $ TAU( I ), WORK, IINFO ) + + DO I = NB+1, K, NB * -* Set columns n-k+i+ib:n of current block to zero +* The last block may be less than size NB * - DO 40 L = N - K + I + IB, N - DO 30 J = II, II + IB - 1 - A( J, L ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE + IB = MIN(NB, K-I+1) + II = M - K + I +* +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) +* + CALL CLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB, + $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK ) +* +* Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right +* + CALL CLARFB0C2(.FALSE., 'Right', 'Conjugate', + $ 'Backward', 'Rowwise', II-1, N-K+I+IB-1, IB, + $ A(II,1), LDA, WORK, LDWORK, A, LDA) +* +* Apply H**T to columns 1:n-k+i+ib-1 of current block +* + CALL CUNGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA, + $ TAU( I ), WORK, IINFO ) + END DO END IF * WORK( 1 ) = SROUNDUP_LWORK(IWS) diff --git a/SRC/slarfb0c2.f b/SRC/slarfb0c2.f index 74a31f6f76..6200e448c5 100644 --- a/SRC/slarfb0c2.f +++ b/SRC/slarfb0c2.f @@ -88,10 +88,10 @@ SUBROUTINE SLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, ! Check to ensure side and trans are the expected values ! IF( .NOT.SIDEL ) THEN - CALL XERBLA('DLARFB0C2', 2) + CALL XERBLA('SLARFB0C2', 2) RETURN ELSE IF(TRANST) THEN - CALL XERBLA('DLARFB0C2', 3) + CALL XERBLA('SLARFB0C2', 3) RETURN END IF ! @@ -168,10 +168,10 @@ SUBROUTINE SLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, ! Check to ensure side and trans are the expected values ! IF( .NOT.SIDER ) THEN - CALL XERBLA('DLARFB0C2', 2) + CALL XERBLA('SLARFB0C2', 2) RETURN ELSE IF(.NOT.TRANST) THEN - CALL XERBLA('DLARFB0C2', 3) + CALL XERBLA('SLARFB0C2', 3) RETURN END IF ! @@ -251,10 +251,10 @@ SUBROUTINE SLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, ! Check to ensure side and trans are the expected values ! IF( .NOT.SIDEL ) THEN - CALL XERBLA('DLARFB0C2', 2) + CALL XERBLA('SLARFB0C2', 2) RETURN ELSE IF(TRANST) THEN - CALL XERBLA('DLARFB0C2', 3) + CALL XERBLA('SLARFB0C2', 3) RETURN END IF ! @@ -331,10 +331,10 @@ SUBROUTINE SLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, ! Check to ensure side and trans are the expected values ! IF( .NOT.SIDER ) THEN - CALL XERBLA('DLARFB0C2', 2) + CALL XERBLA('SLARFB0C2', 2) RETURN ELSE IF(.NOT.TRANST) THEN - CALL XERBLA('DLARFB0C2', 3) + CALL XERBLA('SLARFB0C2', 3) RETURN END IF ! diff --git a/SRC/zlarfb0c2.f b/SRC/zlarfb0c2.f index 3bbf11e268..fab2ca66a9 100644 --- a/SRC/zlarfb0c2.f +++ b/SRC/zlarfb0c2.f @@ -92,10 +92,10 @@ SUBROUTINE ZLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, ! Check to ensure side and trans are the expected values ! IF( .NOT.SIDEL ) THEN - CALL XERBLA('DLARFB0C2', 2) + CALL XERBLA('ZLARFB0C2', 2) RETURN ELSE IF(TRANST) THEN - CALL XERBLA('DLARFB0C2', 3) + CALL XERBLA('ZLARFB0C2', 3) RETURN END IF ! @@ -172,10 +172,10 @@ SUBROUTINE ZLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, ! Check to ensure side and trans are the expected values ! IF( .NOT.SIDER ) THEN - CALL XERBLA('DLARFB0C2', 2) + CALL XERBLA('ZLARFB0C2', 2) RETURN ELSE IF(.NOT.TRANST) THEN - CALL XERBLA('DLARFB0C2', 3) + CALL XERBLA('ZLARFB0C2', 3) RETURN END IF ! @@ -255,10 +255,10 @@ SUBROUTINE ZLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, ! Check to ensure side and trans are the expected values ! IF( .NOT.SIDEL ) THEN - CALL XERBLA('DLARFB0C2', 2) + CALL XERBLA('ZLARFB0C2', 2) RETURN ELSE IF(TRANST) THEN - CALL XERBLA('DLARFB0C2', 3) + CALL XERBLA('ZLARFB0C2', 3) RETURN END IF ! @@ -335,10 +335,10 @@ SUBROUTINE ZLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, ! Check to ensure side and trans are the expected values ! IF( .NOT.SIDER ) THEN - CALL XERBLA('DLARFB0C2', 2) + CALL XERBLA('ZLARFB0C2', 2) RETURN ELSE IF(.NOT.TRANST) THEN - CALL XERBLA('DLARFB0C2', 3) + CALL XERBLA('ZLARFB0C2', 3) RETURN END IF ! From 46dd9bd05ae29917bff6ac0306fde6bd20d0ef14 Mon Sep 17 00:00:00 2001 From: Johnathan Rhyne Date: Fri, 7 Mar 2025 15:23:37 -0700 Subject: [PATCH 05/10] Adding documentation to my new functions --- SRC/clarfb0c2.f | 184 +++++++++++++++++++++++++++++++++++++++++++++++- SRC/dlarfb0c2.f | 180 +++++++++++++++++++++++++++++++++++++++++++++- SRC/slarfb0c2.f | 179 ++++++++++++++++++++++++++++++++++++++++++++++ SRC/zlarfb0c2.f | 181 ++++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 720 insertions(+), 4 deletions(-) diff --git a/SRC/clarfb0c2.f b/SRC/clarfb0c2.f index 4ab8f47aea..4a7a2018d3 100644 --- a/SRC/clarfb0c2.f +++ b/SRC/clarfb0c2.f @@ -1,3 +1,183 @@ +*> \brief \b CLARFB0C2 applies a block reflector or its conjugate-transpose +* to a rectangular matrix with a 0 block while constructing the explicit Q +* factor +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* +* Definition: +* =========== +* +* SUBROUTINE CLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, +* $ K, V, LDV, T, LDT, C, LDC) +* ! Scalar arguments +* INTEGER M, N, K, LDV, LDC, LDT +* CHARACTER SIDE, TRANS, DIRECT, STOREV +* ! True means that we are assuming C2 is the identity matrix +* ! and thus don't reference whatever is present in C2 +* ! at the beginning. +* LOGICAL C2I +* ! Array arguments +* COMPLEX V(LDV,*), C(LDC,*), T(LDT,*) +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CLARFB0C2 applies a real block reflector H or its transpose H**H to a +*> complex m by n matrix C with a 0 block, while computing the explicit Q factor +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] C2I +*> \verbatim +*> C2I is LOGICAL +*> = .TRUE.: Assume the nonzero block of C is the identity matrix +*> = .FALSE.: Use existing data in the nonzero block of C +*> \endverbatim +*> +*> \param[in] SIDE +*> \verbatim +*> SIDE is CHARACTER*1 +*> = 'L': apply H or H**H from the Left +*> = 'R': apply H or H**H from the Right +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> = 'N': apply H (No transpose) +*> = 'C': apply H**H (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] DIRECT +*> \verbatim +*> DIRECT is CHARACTER*1 +*> Indicates how H is formed from a product of elementary +*> reflectors +*> = 'F': H = H(1) H(2) . . . H(k) (Forward) +*> = 'B': H = H(k) . . . H(2) H(1) (Backward) +*> \endverbatim +*> +*> \param[in] STOREV +*> \verbatim +*> STOREV is CHARACTER*1 +*> Indicates how the vectors which define the elementary +*> reflectors are stored: +*> = 'C': Columnwise +*> = 'R': Rowwise +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix C. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix C. +*> \endverbatim +*> +*> \param[in] K +*> \verbatim +*> K is INTEGER +*> The order of the matrix T (= the number of elementary +*> reflectors whose product defines the block reflector). +*> If SIDE = 'L', M >= K >= 0; +*> if SIDE = 'R', N >= K >= 0. +*> \endverbatim +*> +*> \param[in] V +*> \verbatim +*> V is COMPLEX array, dimension +*> (LDV,K) if STOREV = 'C' +*> (LDV,M) if STOREV = 'R' and SIDE = 'L' +*> (LDV,N) if STOREV = 'R' and SIDE = 'R' +*> See Further Details. +*> \endverbatim +*> +*> \param[in] LDV +*> \verbatim +*> LDV is INTEGER +*> The leading dimension of the array V. +*> If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M); +*> if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N); +*> if STOREV = 'R', LDV >= K. +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is COMPLEX array, dimension (LDT,K) +*> The triangular K-by-K matrix T in the representation of the +*> block reflector. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= K. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is COMPLEX array, dimension (LDC,N) +*> On entry, the M-by-N matrix C. +*> On exit, C is overwritten by H*C or H**H*C or C*H or C*H**H. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max(1,M). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup larfb +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> The shape of the matrix V and the storage of the vectors which define +*> the H(i) is best illustrated by the following example with n = 5 and +*> k = 3. The triangular part of V (including its diagonal) is not +*> referenced. +*> +*> DIRECT = 'F' and STOREV = 'C': DIRECT = 'F' and STOREV = 'R': +*> +*> V = ( 1 ) V = ( 1 v1 v1 v1 v1 ) +*> ( v1 1 ) ( 1 v2 v2 v2 ) +*> ( v1 v2 1 ) ( 1 v3 v3 ) +*> ( v1 v2 v3 ) +*> ( v1 v2 v3 ) +*> +*> DIRECT = 'B' and STOREV = 'C': DIRECT = 'B' and STOREV = 'R': +*> +*> V = ( v1 v2 v3 ) V = ( v1 v1 1 ) +*> ( v1 v2 v3 ) ( v2 v2 v2 1 ) +*> ( 1 v2 v3 ) ( v3 v3 v3 v3 1 ) +*> ( 1 v3 ) +*> ( 1 ) +*> \endverbatim +*> +* ===================================================================== SUBROUTINE CLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, $ K, V, LDV, T, LDT, C, LDC) ! Scalar arguments @@ -9,7 +189,7 @@ SUBROUTINE CLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, LOGICAL C2I ! Array arguments - COMPLEX*8 V(LDV,*), C(LDC,*), T(LDT,*) + COMPLEX V(LDV,*), C(LDC,*), T(LDT,*) ! Local scalars LOGICAL QR, LQ, QL, DIRF, COLV, SIDEL, SIDER, $ TRANST @@ -22,7 +202,7 @@ SUBROUTINE CLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, ! External Subroutines EXTERNAL CGEMM, CTRMM ! Parameters - COMPLEX*8 ONE, ZERO, NEG_ONE + COMPLEX ONE, ZERO, NEG_ONE PARAMETER(ONE=(1.0E+0, 0.0E+0), $ ZERO = (0.0E+0, 0.0E+0), $ NEG_ONE = (-1.0E+0, 0.0E+0)) diff --git a/SRC/dlarfb0c2.f b/SRC/dlarfb0c2.f index 4651cdbea4..493580e090 100644 --- a/SRC/dlarfb0c2.f +++ b/SRC/dlarfb0c2.f @@ -1,3 +1,182 @@ +*> \brief \b DLARFB0C2 applies a block reflector or its transpose to a +* rectangular matrix with a 0 block while constructing the explicit Q factor +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* +* Definition: +* =========== +* +* SUBROUTINE DLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, +* $ K, V, LDV, T, LDT, C, LDC) +* ! Scalar arguments +* INTEGER M, N, K, LDV, LDC, LDT +* CHARACTER SIDE, TRANS, DIRECT, STOREV +* ! True means that we are assuming C2 is the identity matrix +* ! and thus don't reference whatever is present in C2 +* ! at the beginning. +* LOGICAL C2I +* ! Array arguments +* DOUBLE PRECISION V(LDV,*), C(LDC,*), T(LDT,*) +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DLARFB0C2 applies a real block reflector H or its transpose H**T to a +*> real m by n matrix C with a 0 block, while computing the explicit Q factor +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] C2I +*> \verbatim +*> C2I is LOGICAL +*> = .TRUE.: Assume the nonzero block of C is the identity matrix +*> = .FALSE.: Use existing data in the nonzero block of C +*> \endverbatim +*> +*> \param[in] SIDE +*> \verbatim +*> SIDE is CHARACTER*1 +*> = 'L': apply H or H**T from the Left +*> = 'R': apply H or H**T from the Right +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> = 'N': apply H (No transpose) +*> = 'T': apply H**T (Transpose) +*> \endverbatim +*> +*> \param[in] DIRECT +*> \verbatim +*> DIRECT is CHARACTER*1 +*> Indicates how H is formed from a product of elementary +*> reflectors +*> = 'F': H = H(1) H(2) . . . H(k) (Forward) +*> = 'B': H = H(k) . . . H(2) H(1) (Backward) +*> \endverbatim +*> +*> \param[in] STOREV +*> \verbatim +*> STOREV is CHARACTER*1 +*> Indicates how the vectors which define the elementary +*> reflectors are stored: +*> = 'C': Columnwise +*> = 'R': Rowwise +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix C. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix C. +*> \endverbatim +*> +*> \param[in] K +*> \verbatim +*> K is INTEGER +*> The order of the matrix T (= the number of elementary +*> reflectors whose product defines the block reflector). +*> If SIDE = 'L', M >= K >= 0; +*> if SIDE = 'R', N >= K >= 0. +*> \endverbatim +*> +*> \param[in] V +*> \verbatim +*> V is DOUBLE PRECISION array, dimension +*> (LDV,K) if STOREV = 'C' +*> (LDV,M) if STOREV = 'R' and SIDE = 'L' +*> (LDV,N) if STOREV = 'R' and SIDE = 'R' +*> The matrix V. See Further Details. +*> \endverbatim +*> +*> \param[in] LDV +*> \verbatim +*> LDV is INTEGER +*> The leading dimension of the array V. +*> If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M); +*> if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N); +*> if STOREV = 'R', LDV >= K. +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is DOUBLE PRECISION array, dimension (LDT,K) +*> The triangular k by k matrix T in the representation of the +*> block reflector. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= K. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is DOUBLE PRECISION array, dimension (LDC,N) +*> On entry, the m by n matrix C. +*> On exit, C is overwritten by H*C or H**T*C or C*H or C*H**T. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max(1,M). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup larfb +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> The shape of the matrix V and the storage of the vectors which define +*> the H(i) is best illustrated by the following example with n = 5 and +*> k = 3. The triangular part of V (including its diagonal) is not +*> referenced. +*> +*> DIRECT = 'F' and STOREV = 'C': DIRECT = 'F' and STOREV = 'R': +*> +*> V = ( 1 ) V = ( 1 v1 v1 v1 v1 ) +*> ( v1 1 ) ( 1 v2 v2 v2 ) +*> ( v1 v2 1 ) ( 1 v3 v3 ) +*> ( v1 v2 v3 ) +*> ( v1 v2 v3 ) +*> +*> DIRECT = 'B' and STOREV = 'C': DIRECT = 'B' and STOREV = 'R': +*> +*> V = ( v1 v2 v3 ) V = ( v1 v1 1 ) +*> ( v1 v2 v3 ) ( v2 v2 v2 1 ) +*> ( 1 v2 v3 ) ( v3 v3 v3 v3 1 ) +*> ( 1 v3 ) +*> ( 1 ) +*> \endverbatim +*> +* ===================================================================== SUBROUTINE DLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, $ K, V, LDV, T, LDT, C, LDC) ! Scalar arguments @@ -7,7 +186,6 @@ SUBROUTINE DLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, ! and thus don't reference whatever is present in C2 ! at the beginning. LOGICAL C2I - ! Array arguments DOUBLE PRECISION V(LDV,*), C(LDC,*), T(LDT,*) ! Local scalars diff --git a/SRC/slarfb0c2.f b/SRC/slarfb0c2.f index 6200e448c5..e3c2229045 100644 --- a/SRC/slarfb0c2.f +++ b/SRC/slarfb0c2.f @@ -1,3 +1,182 @@ +*> \brief \b SLARFB0C2 applies a block reflector or its transpose to a +* rectangular matrix with a 0 block while constructing the explicit Q factor +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* +* Definition: +* =========== +* +* SUBROUTINE SLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, +* $ K, V, LDV, T, LDT, C, LDC) +* ! Scalar arguments +* INTEGER M, N, K, LDV, LDC, LDT +* CHARACTER SIDE, TRANS, DIRECT, STOREV +* ! True means that we are assuming C2 is the identity matrix +* ! and thus don't reference whatever is present in C2 +* ! at the beginning. +* LOGICAL C2I +* ! Array arguments +* REAL V(LDV,*), C(LDC,*), T(LDT,*) +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SLARFB0C2 applies a real block reflector H or its transpose H**T to a +*> real m by n matrix C with a 0 block, while computing the explicit Q factor +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] C2I +*> \verbatim +*> C2I is LOGICAL +*> = .TRUE.: Assume the nonzero block of C is the identity matrix +*> = .FALSE.: Use existing data in the nonzero block of C +*> \endverbatim +*> +*> \param[in] SIDE +*> \verbatim +*> SIDE is CHARACTER*1 +*> = 'L': apply H or H**T from the Left +*> = 'R': apply H or H**T from the Right +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> = 'N': apply H (No transpose) +*> = 'T': apply H**T (Transpose) +*> \endverbatim +*> +*> \param[in] DIRECT +*> \verbatim +*> DIRECT is CHARACTER*1 +*> Indicates how H is formed from a product of elementary +*> reflectors +*> = 'F': H = H(1) H(2) . . . H(k) (Forward) +*> = 'B': H = H(k) . . . H(2) H(1) (Backward) +*> \endverbatim +*> +*> \param[in] STOREV +*> \verbatim +*> STOREV is CHARACTER*1 +*> Indicates how the vectors which define the elementary +*> reflectors are stored: +*> = 'C': Columnwise +*> = 'R': Rowwise +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix C. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix C. +*> \endverbatim +*> +*> \param[in] K +*> \verbatim +*> K is INTEGER +*> The order of the matrix T (= the number of elementary +*> reflectors whose product defines the block reflector). +*> If SIDE = 'L', M >= K >= 0; +*> if SIDE = 'R', N >= K >= 0. +*> \endverbatim +*> +*> \param[in] V +*> \verbatim +*> V is REAL array, dimension +*> (LDV,K) if STOREV = 'C' +*> (LDV,M) if STOREV = 'R' and SIDE = 'L' +*> (LDV,N) if STOREV = 'R' and SIDE = 'R' +*> The matrix V. See Further Details. +*> \endverbatim +*> +*> \param[in] LDV +*> \verbatim +*> LDV is INTEGER +*> The leading dimension of the array V. +*> If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M); +*> if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N); +*> if STOREV = 'R', LDV >= K. +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is REAL array, dimension (LDT,K) +*> The triangular k by k matrix T in the representation of the +*> block reflector. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= K. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is REAL array, dimension (LDC,N) +*> On entry, the m by n matrix C. +*> On exit, C is overwritten by H*C or H**T*C or C*H or C*H**T. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max(1,M). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup larfb +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> The shape of the matrix V and the storage of the vectors which define +*> the H(i) is best illustrated by the following example with n = 5 and +*> k = 3. The triangular part of V (including its diagonal) is not +*> referenced. +*> +*> DIRECT = 'F' and STOREV = 'C': DIRECT = 'F' and STOREV = 'R': +*> +*> V = ( 1 ) V = ( 1 v1 v1 v1 v1 ) +*> ( v1 1 ) ( 1 v2 v2 v2 ) +*> ( v1 v2 1 ) ( 1 v3 v3 ) +*> ( v1 v2 v3 ) +*> ( v1 v2 v3 ) +*> +*> DIRECT = 'B' and STOREV = 'C': DIRECT = 'B' and STOREV = 'R': +*> +*> V = ( v1 v2 v3 ) V = ( v1 v1 1 ) +*> ( v1 v2 v3 ) ( v2 v2 v2 1 ) +*> ( 1 v2 v3 ) ( v3 v3 v3 v3 1 ) +*> ( 1 v3 ) +*> ( 1 ) +*> \endverbatim +*> +* ===================================================================== SUBROUTINE SLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, $ K, V, LDV, T, LDT, C, LDC) ! Scalar arguments diff --git a/SRC/zlarfb0c2.f b/SRC/zlarfb0c2.f index fab2ca66a9..1f43b4b141 100644 --- a/SRC/zlarfb0c2.f +++ b/SRC/zlarfb0c2.f @@ -1,3 +1,183 @@ +*> \brief \b ZLARFB0C2 applies a block reflector or its conjugate-transpose +* to a rectangular matrix with a 0 block while constructing the explicit Q +* factor +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* +* Definition: +* =========== +* +* SUBROUTINE ZLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, +* $ K, V, LDV, T, LDT, C, LDC) +* ! Scalar arguments +* INTEGER M, N, K, LDV, LDC, LDT +* CHARACTER SIDE, TRANS, DIRECT, STOREV +* ! True means that we are assuming C2 is the identity matrix +* ! and thus don't reference whatever is present in C2 +* ! at the beginning. +* LOGICAL C2I +* ! Array arguments +* COMPLEX*16 V(LDV,*), C(LDC,*), T(LDT,*) +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZLARFB0C2 applies a real block reflector H or its transpose H**H to a +*> complex m by n matrix C with a 0 block, while computing the explicit Q factor +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] C2I +*> \verbatim +*> C2I is LOGICAL +*> = .TRUE.: Assume the nonzero block of C is the identity matrix +*> = .FALSE.: Use existing data in the nonzero block of C +*> \endverbatim +*> +*> \param[in] SIDE +*> \verbatim +*> SIDE is CHARACTER*1 +*> = 'L': apply H or H**H from the Left +*> = 'R': apply H or H**H from the Right +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> = 'N': apply H (No transpose) +*> = 'C': apply H**H (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] DIRECT +*> \verbatim +*> DIRECT is CHARACTER*1 +*> Indicates how H is formed from a product of elementary +*> reflectors +*> = 'F': H = H(1) H(2) . . . H(k) (Forward) +*> = 'B': H = H(k) . . . H(2) H(1) (Backward) +*> \endverbatim +*> +*> \param[in] STOREV +*> \verbatim +*> STOREV is CHARACTER*1 +*> Indicates how the vectors which define the elementary +*> reflectors are stored: +*> = 'C': Columnwise +*> = 'R': Rowwise +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix C. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix C. +*> \endverbatim +*> +*> \param[in] K +*> \verbatim +*> K is INTEGER +*> The order of the matrix T (= the number of elementary +*> reflectors whose product defines the block reflector). +*> If SIDE = 'L', M >= K >= 0; +*> if SIDE = 'R', N >= K >= 0. +*> \endverbatim +*> +*> \param[in] V +*> \verbatim +*> V is COMPLEX*16 array, dimension +*> (LDV,K) if STOREV = 'C' +*> (LDV,M) if STOREV = 'R' and SIDE = 'L' +*> (LDV,N) if STOREV = 'R' and SIDE = 'R' +*> See Further Details. +*> \endverbatim +*> +*> \param[in] LDV +*> \verbatim +*> LDV is INTEGER +*> The leading dimension of the array V. +*> If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M); +*> if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N); +*> if STOREV = 'R', LDV >= K. +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is COMPLEX*16 array, dimension (LDT,K) +*> The triangular K-by-K matrix T in the representation of the +*> block reflector. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= K. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is COMPLEX*16 array, dimension (LDC,N) +*> On entry, the M-by-N matrix C. +*> On exit, C is overwritten by H*C or H**H*C or C*H or C*H**H. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max(1,M). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup larfb +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> The shape of the matrix V and the storage of the vectors which define +*> the H(i) is best illustrated by the following example with n = 5 and +*> k = 3. The triangular part of V (including its diagonal) is not +*> referenced. +*> +*> DIRECT = 'F' and STOREV = 'C': DIRECT = 'F' and STOREV = 'R': +*> +*> V = ( 1 ) V = ( 1 v1 v1 v1 v1 ) +*> ( v1 1 ) ( 1 v2 v2 v2 ) +*> ( v1 v2 1 ) ( 1 v3 v3 ) +*> ( v1 v2 v3 ) +*> ( v1 v2 v3 ) +*> +*> DIRECT = 'B' and STOREV = 'C': DIRECT = 'B' and STOREV = 'R': +*> +*> V = ( v1 v2 v3 ) V = ( v1 v1 1 ) +*> ( v1 v2 v3 ) ( v2 v2 v2 1 ) +*> ( 1 v2 v3 ) ( v3 v3 v3 v3 1 ) +*> ( 1 v3 ) +*> ( 1 ) +*> \endverbatim +*> +* ===================================================================== SUBROUTINE ZLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, $ K, V, LDV, T, LDT, C, LDC) ! Scalar arguments @@ -7,7 +187,6 @@ SUBROUTINE ZLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, ! and thus don't reference whatever is present in C2 ! at the beginning. LOGICAL C2I - ! Array arguments COMPLEX*16 V(LDV,*), C(LDC,*), T(LDT,*) ! Local scalars From 87c19d1add0735788da978f4fa16a9807076aec2 Mon Sep 17 00:00:00 2001 From: Johnathan Rhyne Date: Fri, 7 Mar 2025 17:47:08 -0700 Subject: [PATCH 06/10] Updating documentation for x{or,un}gy to account for workspace changes --- SRC/cungql.f | 2 -- SRC/cungqr.f | 2 -- SRC/dorgql.f | 2 -- SRC/dorgqr.f | 2 -- SRC/sorgql.f | 2 -- SRC/sorgqr.f | 2 -- SRC/zungql.f | 2 -- SRC/zungqr.f | 2 -- 8 files changed, 16 deletions(-) diff --git a/SRC/cungql.f b/SRC/cungql.f index 893303b4e1..9022d37acc 100644 --- a/SRC/cungql.f +++ b/SRC/cungql.f @@ -95,8 +95,6 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= max(1,N). -*> For optimum performance LWORK >= N*NB, where NB is the -*> optimal blocksize. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns diff --git a/SRC/cungqr.f b/SRC/cungqr.f index 2544b67a99..3d7aaceb96 100644 --- a/SRC/cungqr.f +++ b/SRC/cungqr.f @@ -95,8 +95,6 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= max(1,N). -*> For optimum performance LWORK >= N*NB, where NB is the -*> optimal blocksize. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns diff --git a/SRC/dorgql.f b/SRC/dorgql.f index 13146142ed..378147844d 100644 --- a/SRC/dorgql.f +++ b/SRC/dorgql.f @@ -97,8 +97,6 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= max(1,N). -*> For optimum performance LWORK >= N*NB, where NB is the -*> optimal blocksize. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns diff --git a/SRC/dorgqr.f b/SRC/dorgqr.f index 4b323f68fb..8f3c442783 100644 --- a/SRC/dorgqr.f +++ b/SRC/dorgqr.f @@ -97,8 +97,6 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= max(1,N). -*> For optimum performance LWORK >= N*NB, where NB is the -*> optimal blocksize. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns diff --git a/SRC/sorgql.f b/SRC/sorgql.f index 69484d2c36..c9b2918b71 100644 --- a/SRC/sorgql.f +++ b/SRC/sorgql.f @@ -95,8 +95,6 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= max(1,N). -*> For optimum performance LWORK >= N*NB, where NB is the -*> optimal blocksize. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns diff --git a/SRC/sorgqr.f b/SRC/sorgqr.f index 65d397de36..070257c46f 100644 --- a/SRC/sorgqr.f +++ b/SRC/sorgqr.f @@ -95,8 +95,6 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= max(1,N). -*> For optimum performance LWORK >= N*NB, where NB is the -*> optimal blocksize. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns diff --git a/SRC/zungql.f b/SRC/zungql.f index e3e604e791..2aa5295564 100644 --- a/SRC/zungql.f +++ b/SRC/zungql.f @@ -95,8 +95,6 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= max(1,N). -*> For optimum performance LWORK >= N*NB, where NB is the -*> optimal blocksize. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns diff --git a/SRC/zungqr.f b/SRC/zungqr.f index bb5dd9ee03..e3d31a7442 100644 --- a/SRC/zungqr.f +++ b/SRC/zungqr.f @@ -95,8 +95,6 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= max(1,N). -*> For optimum performance LWORK >= N*NB, where NB is the -*> optimal blocksize. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns From 318a43d20bc343d7af499dec99960b83ea0819db Mon Sep 17 00:00:00 2001 From: Johnathan Rhyne Date: Fri, 7 Mar 2025 17:51:50 -0700 Subject: [PATCH 07/10] fixed whitespace inconsistencies in subroutine definitions --- SRC/zunglq.f | 2 +- SRC/zungql.f | 2 +- SRC/zungqr.f | 2 +- SRC/zungrq.f | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/SRC/zunglq.f b/SRC/zunglq.f index 9da595f491..54aaf7686e 100644 --- a/SRC/zunglq.f +++ b/SRC/zunglq.f @@ -121,7 +121,7 @@ *> \ingroup unglq * * ===================================================================== - SUBROUTINE ZUNGLQ(M, N, K, A, LDA, TAU, WORK, LWORK, INFO) + SUBROUTINE ZUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- diff --git a/SRC/zungql.f b/SRC/zungql.f index 2aa5295564..74c5ec8e65 100644 --- a/SRC/zungql.f +++ b/SRC/zungql.f @@ -120,7 +120,7 @@ *> \ingroup ungql * * ===================================================================== - SUBROUTINE ZUNGQL(M, N, K, A, LDA, TAU, WORK, LWORK, INFO) + SUBROUTINE ZUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- diff --git a/SRC/zungqr.f b/SRC/zungqr.f index e3d31a7442..50d5d91a2a 100644 --- a/SRC/zungqr.f +++ b/SRC/zungqr.f @@ -120,7 +120,7 @@ *> \ingroup ungqr * * ===================================================================== - SUBROUTINE ZUNGQR(M, N, K, A, LDA, TAU, WORK, LWORK, INFO) + SUBROUTINE ZUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- diff --git a/SRC/zungrq.f b/SRC/zungrq.f index 1a7640d102..939fff3fcf 100644 --- a/SRC/zungrq.f +++ b/SRC/zungrq.f @@ -122,7 +122,7 @@ *> \ingroup ungrq * * ===================================================================== - SUBROUTINE ZUNGRQ(M, N, K, A, LDA, TAU, WORK, LWORK, INFO) + SUBROUTINE ZUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- From bf84397ea9cb17508e068a2401a36b256beeb62b Mon Sep 17 00:00:00 2001 From: Johnathan Rhyne Date: Sat, 8 Mar 2025 15:19:06 -0700 Subject: [PATCH 08/10] added definitions to build with _64 using cmake --- SRC/clarfb0c2.f | 4 ++-- SRC/lapack_64.h | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/SRC/clarfb0c2.f b/SRC/clarfb0c2.f index 4a7a2018d3..9b5b739174 100644 --- a/SRC/clarfb0c2.f +++ b/SRC/clarfb0c2.f @@ -178,8 +178,8 @@ *> \endverbatim *> * ===================================================================== - SUBROUTINE CLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, - $ K, V, LDV, T, LDT, C, LDC) + SUBROUTINE CLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, + $ M, N, K, V, LDV, T, LDT, C, LDC) ! Scalar arguments INTEGER M, N, K, LDV, LDC, LDT CHARACTER SIDE, TRANS, DIRECT, STOREV diff --git a/SRC/lapack_64.h b/SRC/lapack_64.h index e8000bf2c4..e8d7920946 100644 --- a/SRC/lapack_64.h +++ b/SRC/lapack_64.h @@ -331,6 +331,7 @@ #define CLARF1F CLARF1F_64 #define CLARF1L CLARF1L_64 #define CLARFB CLARFB_64 +#define CLARFB0C2 CLARFB0C2_64 #define CLARFB_GETT CLARFB_GETT_64 #define CLARFG CLARFG_64 #define CLARFGP CLARFGP_64 @@ -805,6 +806,7 @@ #define DLARF1F DLARF1F_64 #define DLARF1L DLARF1L_64 #define DLARFB DLARFB_64 +#define DLARFB0C2 DLARFB0C2_64 #define DLARFB_GETT DLARFB_GETT_64 #define DLARFG DLARFG_64 #define DLARFGP DLARFGP_64 @@ -1400,6 +1402,7 @@ #define SLARF1F SLARF1F_64 #define SLARF1L SLARF1L_64 #define SLARFB SLARFB_64 +#define SLARFB0C2 SLARFB0C2_64 #define SLARFB_GETT SLARFB_GETT_64 #define SLARFG SLARFG_64 #define SLARFGP SLARFGP_64 @@ -2046,6 +2049,7 @@ #define ZLARF1F ZLARF1F_64 #define ZLARF1L ZLARF1L_64 #define ZLARFB ZLARFB_64 +#define ZLARFB0C2 ZLARFB0C2_64 #define ZLARFB_GETT ZLARFB_GETT_64 #define ZLARFG ZLARFG_64 #define ZLARFGP ZLARFGP_64 From 181b0cc9265ad0a2604e3bb00c2bece7f7972896 Mon Sep 17 00:00:00 2001 From: Johnathan Rhyne Date: Sat, 8 Mar 2025 15:20:17 -0700 Subject: [PATCH 09/10] fixed inconsistent function definitiom of CLARFB0C2 --- SRC/clarfb0c2.f | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/SRC/clarfb0c2.f b/SRC/clarfb0c2.f index 9b5b739174..9d788df707 100644 --- a/SRC/clarfb0c2.f +++ b/SRC/clarfb0c2.f @@ -178,8 +178,8 @@ *> \endverbatim *> * ===================================================================== - SUBROUTINE CLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, - $ M, N, K, V, LDV, T, LDT, C, LDC) + SUBROUTINE CLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N + $ K, V, LDV, T, LDT, C, LDC) ! Scalar arguments INTEGER M, N, K, LDV, LDC, LDT CHARACTER SIDE, TRANS, DIRECT, STOREV From 4e3bad0aed6f67e04098c24698c9c61e094aeadd Mon Sep 17 00:00:00 2001 From: Johnathan Rhyne Date: Sat, 8 Mar 2025 15:22:14 -0700 Subject: [PATCH 10/10] adding a dropped comma --- SRC/clarfb0c2.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SRC/clarfb0c2.f b/SRC/clarfb0c2.f index 9d788df707..4a7a2018d3 100644 --- a/SRC/clarfb0c2.f +++ b/SRC/clarfb0c2.f @@ -178,7 +178,7 @@ *> \endverbatim *> * ===================================================================== - SUBROUTINE CLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N + SUBROUTINE CLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, $ K, V, LDV, T, LDT, C, LDC) ! Scalar arguments INTEGER M, N, K, LDV, LDC, LDT