Skip to content

Commit

Permalink
Reorder and uniformize cuda and hip bodies
Browse files Browse the repository at this point in the history
Signed-off-by: Aurelien Bouteiller <[email protected]>
  • Loading branch information
abouteiller committed Jun 27, 2023
1 parent e63d3e7 commit 7015aa6
Showing 1 changed file with 96 additions and 94 deletions.
190 changes: 96 additions & 94 deletions src/zpotrf_L.jdf
Original file line number Diff line number Diff line change
Expand Up @@ -178,24 +178,24 @@ END
BODY [type=HIP]
{
int tempkm = k == descA->mt-1 ? descA->m - k*descA->mb : descA->mb;
int ldak = BLKLDD( descA, k );
int ldak = LDA(descA, T);

rocblas_status status;
rocblas_fill rocblas_uplo;
dplasma_potrf_gpu_workspaces_t *wp;
int *d_iinfo;
dplasma_hip_handles_t *handles;

if( PlasmaLower == uplo )
rocblas_uplo = rocblas_fill_lower;
if( PlasmaUpper == uplo )
rocblas_uplo = rocblas_fill_upper;

dplasma_hip_handles_t *handles;
handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey);
assert(NULL != handles);

wp = parsec_info_get(&gpu_device->super.infos, hip_workspaces_infokey);
assert(NULL != wp);

d_iinfo = (int*)wp->tmpmem;

status = rocsolver_zpotrf( handles->hipblas_handle, rocblas_uplo, tempkm, T, ldak, d_iinfo);
Expand Down Expand Up @@ -250,6 +250,42 @@ RW C <- (k == 0) ? ddescA(m, k) [ type = %{ return ADTT_

; (m >= (descA->mt - PRI_CHANGE)) ? (descA->mt - m) * (descA->mt - m) * (descA->mt - m) + 3 * ((2 * descA->mt) - k - m - 1) * (m - k) : PRI_MAX

BODY [type=RECURSIVE]
{
int tempmm = m == descA->mt-1 ? descA->m - m * descA->mb : descA->mb;

if ( (tempmm > smallnb) || (descA->nb > smallnb) )
{
subtile_desc_t *small_descT;
subtile_desc_t *small_descC;
parsec_taskpool_t* parsec_ztrsm;


small_descT = subtile_desc_create( descA, k, k,
smallnb, smallnb, 0, 0, descA->nb, descA->nb );
small_descT->mat = T;

small_descC = subtile_desc_create( descA, m, k,
smallnb, smallnb, 0, 0, tempmm, descA->nb );
small_descC->mat = C;

parsec_ztrsm = dplasma_ztrsm_New(dplasmaRight, dplasmaLower,
dplasmaConjTrans, dplasmaNonUnit,
(dplasma_complex64_t)1.0,
(parsec_tiled_matrix_t *)small_descT,
(parsec_tiled_matrix_t *)small_descC );

parsec_recursivecall((parsec_task_t*)this_task,
parsec_ztrsm, dplasma_ztrsm_Destruct,
2, small_descT, small_descC );

return PARSEC_HOOK_RETURN_ASYNC;
}
/* Go for the sequential CPU version */
return PARSEC_HOOK_RETURN_NEXT;
}
END

BODY [type=CUDA]
{
int tempmm = m == descA->mt-1 ? descA->m - m * descA->mb : descA->mb;
Expand All @@ -269,24 +305,22 @@ BODY [type=CUDA]
CUBLAS_OP_C, CUBLAS_DIAG_NON_UNIT,
tempmm, descA->nb,
&zone, T, ldak, C, ldam);
PARSEC_CUDA_CHECK_ERROR( "cublasZtrsm_v2 ", status,
{return PARSEC_HOOK_RETURN_ERROR;} );
PARSEC_CUDA_CHECK_ERROR( "cublasZtrsm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} );
}
END

BODY [type=HIP]
{
int tempmm = m == descA->mt-1 ? descA->m - m * descA->mb : descA->mb;
int ldak = LDA(ddescA, T);
int ldam = LDA(ddescA, C);
dplasma_hip_handles_t *handles;
#if defined(PRECISION_z) || defined(PRECISION_c)
hipblasDoubleComplex zone = { 1., 0. };
#else
double zone = 1.;
#endif
int tempmm = m == descA->mt-1 ? descA->m - m * descA->mb : descA->mb;
int ldak = LDA(ddescA, T);
int ldam = LDA(ddescA, C);

hipblasStatus_t status;
dplasma_hip_handles_t *handles;
handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey);
assert(NULL != handles);
status = hipblasZtrsm(handles->hipblas_handle,
Expand All @@ -298,42 +332,6 @@ BODY [type=HIP]
}
END

BODY [type=RECURSIVE]
{
int tempmm = m == descA->mt-1 ? descA->m - m * descA->mb : descA->mb;

if ( (tempmm > smallnb) || (descA->nb > smallnb) )
{
subtile_desc_t *small_descT;
subtile_desc_t *small_descC;
parsec_taskpool_t* parsec_ztrsm;


small_descT = subtile_desc_create( descA, k, k,
smallnb, smallnb, 0, 0, descA->nb, descA->nb );
small_descT->mat = T;

small_descC = subtile_desc_create( descA, m, k,
smallnb, smallnb, 0, 0, tempmm, descA->nb );
small_descC->mat = C;

parsec_ztrsm = dplasma_ztrsm_New(dplasmaRight, dplasmaLower,
dplasmaConjTrans, dplasmaNonUnit,
(dplasma_complex64_t)1.0,
(parsec_tiled_matrix_t *)small_descT,
(parsec_tiled_matrix_t *)small_descC );

parsec_recursivecall((parsec_task_t*)this_task,
parsec_ztrsm, dplasma_ztrsm_Destruct,
2, small_descT, small_descC );

return PARSEC_HOOK_RETURN_ASYNC;
}
/* Go for the sequential CPU version */
return PARSEC_HOOK_RETURN_NEXT;
}
END

BODY
{
int tempmm = m == descA->mt-1 ? descA->m - m * descA->mb : descA->mb;
Expand Down Expand Up @@ -406,14 +404,14 @@ END

BODY [type=HIP]
{
double zone = 1.;
double mzone = -1.;
int tempmm = m == descA->mt-1 ? descA->m - m*descA->mb : descA->mb;
int ldam_A = LDA(ddescA, A);
int ldam_T = LDA(ddescA, T);

hipblasStatus_t status;
dplasma_hip_handles_t *handles;
double zone = 1.;
double mzone = -1.;
hipblasStatus_t status;

handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey);
assert(NULL != handles);
status = hipblasZherk( handles->hipblas_handle,
Expand Down Expand Up @@ -506,6 +504,47 @@ RW C <- (k == 0) ? ddescA(m, n) [ type = %{ return ADTT_

; (m >= (descA->mt - PRI_CHANGE)) ? (descA->mt - m) * (descA->mt - m) * (descA->mt - m) + 3 * ((2 * descA->mt) - m - n - 3) * (m - n) + 6 * (m - k) : PRI_MAX

BODY [type=RECURSIVE]
{
int tempmm = m == descA->mt-1 ? descA->m - m * descA->mb : descA->mb;

if ( (tempmm > smallnb) || (descA->nb > smallnb) )
{
subtile_desc_t *small_descA;
subtile_desc_t *small_descB;
subtile_desc_t *small_descC;
parsec_taskpool_t *parsec_zgemm;

small_descA = subtile_desc_create( descA, m, k,
smallnb, smallnb, 0, 0, tempmm, descA->nb );
small_descA->mat = A;

small_descB = subtile_desc_create( descA, n, k,
smallnb, smallnb, 0, 0, descA->mb, descA->nb );
small_descB->mat = B;

small_descC = subtile_desc_create( descA, m, n,
smallnb, smallnb, 0, 0, tempmm, descA->nb );
small_descC->mat = C;

parsec_zgemm = dplasma_zgemm_New(dplasmaNoTrans, dplasmaConjTrans,
(dplasma_complex64_t)-1.0,
(parsec_tiled_matrix_t *)small_descA,
(parsec_tiled_matrix_t *)small_descB,
(dplasma_complex64_t) 1.0,
(parsec_tiled_matrix_t *)small_descC);

parsec_recursivecall((parsec_task_t*)this_task,
parsec_zgemm, dplasma_zgemm_Destruct,
3, small_descA, small_descB, small_descC );

return PARSEC_HOOK_RETURN_ASYNC;
}
/* Go to CPU sequential kernel */
return PARSEC_HOOK_RETURN_NEXT;
}
END

BODY [type=CUDA
A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%}
B.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%}
Expand All @@ -525,13 +564,12 @@ BODY [type=CUDA
int ldam_A = LDA(ddescA, A);
int ldan_B = LDA(ddescA, B);
int ldam_C = LDA(ddescA, C);

dplasma_cuda_handles_t *handles;
cublasStatus_t status;
assert( ldam_A <= descA->mb );
assert( ldan_B <= descA->mb );
assert( ldam_C <= descA->mb );

cublasStatus_t status;
dplasma_cuda_handles_t *handles;
handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey);
assert(NULL != handles);

Expand All @@ -547,6 +585,10 @@ BODY [type=CUDA
END

BODY [type=HIP
A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%}
B.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%}
C.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%}
A.dc=ddescA B.dc=ddescA C.dc=ddescA
stage_in=dplasma_hip_lapack_stage_in
stage_out=dplasma_hip_lapack_stage_out]
{
Expand All @@ -569,6 +611,7 @@ BODY [type=HIP
dplasma_hip_handles_t *handles;
handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey);
assert(NULL != handles);

status = hipblasZgemm( handles->hipblas_handle,
HIPBLAS_OP_N, HIPBLAS_OP_C,
tempmm, descA->mb, descA->mb,
Expand All @@ -579,47 +622,6 @@ BODY [type=HIP
}
END

BODY [type=RECURSIVE]
{
int tempmm = m == descA->mt-1 ? descA->m - m * descA->mb : descA->mb;

if ( (tempmm > smallnb) || (descA->nb > smallnb) )
{
subtile_desc_t *small_descA;
subtile_desc_t *small_descB;
subtile_desc_t *small_descC;
parsec_taskpool_t *parsec_zgemm;

small_descA = subtile_desc_create( descA, m, k,
smallnb, smallnb, 0, 0, tempmm, descA->nb );
small_descA->mat = A;

small_descB = subtile_desc_create( descA, n, k,
smallnb, smallnb, 0, 0, descA->mb, descA->nb );
small_descB->mat = B;

small_descC = subtile_desc_create( descA, m, n,
smallnb, smallnb, 0, 0, tempmm, descA->nb );
small_descC->mat = C;

parsec_zgemm = dplasma_zgemm_New(dplasmaNoTrans, dplasmaConjTrans,
(dplasma_complex64_t)-1.0,
(parsec_tiled_matrix_t *)small_descA,
(parsec_tiled_matrix_t *)small_descB,
(dplasma_complex64_t) 1.0,
(parsec_tiled_matrix_t *)small_descC);

parsec_recursivecall((parsec_task_t*)this_task,
parsec_zgemm, dplasma_zgemm_Destruct,
3, small_descA, small_descB, small_descC );

return PARSEC_HOOK_RETURN_ASYNC;
}
/* Go to CPU sequential kernel */
return PARSEC_HOOK_RETURN_NEXT;
}
END

BODY
{
int tempmm = m == descA->mt-1 ? descA->m - m * descA->mb : descA->mb;
Expand Down

0 comments on commit 7015aa6

Please sign in to comment.