Commit c51a58cb authored by mal539
Browse files

Testing - nowait, oneChunkPerThread.

parent 2e14c72a
......@@ -30,14 +30,14 @@ export I_MPI_ASYNC_PROGRESS_THREADS=0
#test_prefix=COMM_
#executable="${HOME_DIR}dist/release_intel/Intel-Linux/bem4i_comm_test"
#test_prefix=OVERLAP_
#executable="${HOME_DIR}dist/release_intel/Intel-Linux/bem4i_novel"
#ex_name=bem4i_novel
test_prefix=OVERLAP_
executable="${HOME_DIR}dist/release_intel/Intel-Linux/bem4i_novel"
ex_name=bem4i_novel
test_prefix=BLOCKING_
executable="${HOME_DIR}dist/release_intel/Intel-Linux/bem4i_classic"
#executable="${HOME_DIR}dist/release_intel/Intel-Linux/bem4i_classic_ompDYN1"
ex_name=bem4i_classic
#test_prefix=BLOCKING_
#executable="${HOME_DIR}dist/release_intel/Intel-Linux/bem4i_classic"
#executable="${HOME_DIR}dist/release_intel/Intel-Linux/bem4i_classic_chunk_oneperthread_nowait"
#ex_name=bem4i_classic
test_index=1
......@@ -72,7 +72,7 @@ gmres_buffer_size=500
gmres_type=0
use_preconditioner=0
nsegments=4
nsegments=1
arguments="${refine4} ${refine9} ${wave_direction[0]} ${wave_direction[1]} ${wave_direction[2]} ${aca_eta} ${aca_eps} ${solver_eps} ${cluster_base_dim} ${block_cluster_max_dim} ${n_subdomains} ${eval_boundary_refine} ${eval_refine} ${alpha} ${problem_idx} ${max_num_iterations} -1 ${max_rank} ${gmres_buffer_size} ${gmres_type} ${use_preconditioner} ${nsegments}"
......@@ -84,6 +84,6 @@ echo mpirun -np ${mpi} trace.sh ${executable} ${arguments}
mpirun -np ${mpi} ./trace.sh ./${executable} ${arguments}
mv ${ex_name}.prv /home/mal539/POP/bem4i_kernel_reduction/bem4i_MK/traces/omp_schedule/${ex_name}_ref${refine4}_${refine9}_mpi${mpi}.prv
mv ${ex_name}.row /home/mal539/POP/bem4i_kernel_reduction/bem4i_MK/traces/omp_schedule/${ex_name}_ref${refine4}_${refine9}_mpi${mpi}.row
mv ${ex_name}.pcf /home/mal539/POP/bem4i_kernel_reduction/bem4i_MK/traces/omp_schedule/${ex_name}_ref${refine4}_${refine9}_mpi${mpi}.pcf
mv ${ex_name}.prv /home/mal539/POP/bem4i_kernel_reduction/bem4i_MK/traces/${ex_name}_ref${refine4}_${refine9}_mpi${mpi}.prv
mv ${ex_name}.row /home/mal539/POP/bem4i_kernel_reduction/bem4i_MK/traces/${ex_name}_ref${refine4}_${refine9}_mpi${mpi}.row
mv ${ex_name}.pcf /home/mal539/POP/bem4i_kernel_reduction/bem4i_MK/traces/${ex_name}_ref${refine4}_${refine9}_mpi${mpi}.pcf
......@@ -1315,7 +1315,7 @@ void LocalMultiTraceOperator<LO, SC>::apply(
// throw runtime_error( "Error: LocalMultiTraceOperator::apply( ... ), non-blocking synchronization scheme does not support transposed apply yet\n" );
// }
// else{
// this->apply_nonblocking( x, y, false, alpha, beta / this->nprocs );
// this->apply_nonblocking( x, y, false, alpha, beta / this->nprocs );
// }
}
......@@ -1440,7 +1440,7 @@ void LocalMultiTraceOperator<LO, SC>::apply_nonblocking(
}
}
if( tid == 0 ){
/* if( tid == 0 ){
for( LO j = 0; j < si - 1; ++j ){
if( this->comm_flags[ this->segmented_order[ j ] ] ){
......@@ -1450,7 +1450,7 @@ void LocalMultiTraceOperator<LO, SC>::apply_nonblocking(
MPI_Test( &this->comm_requests[ this->segmented_order[ j ] ], &this->comm_flags[ this->segmented_order[ j ] ], MPI_STATUS_IGNORE );
}
}
}*/
}
/* sparse */
for( LO j = 0; j < segmented_thread_sparse_rows[ tid ][ s ].size(); ++j ){
......@@ -1908,7 +1908,7 @@ void LocalMultiTraceOperator<LO, SC>::apply_no_sync(
/* double layer potential */
#pragma omp for schedule( dynamic, 8 )
#pragma omp for schedule( dynamic, 1 )
for ( LO i = 0; i < blocks_K.size( ); ++i ) {
block = blocks_K[ i ];
segment_idx = blocks_segment_indices_K[ i ];
......@@ -1939,9 +1939,9 @@ void LocalMultiTraceOperator<LO, SC>::apply_no_sync(
}
}
#pragma omp barrier
// #pragma omp barrier
/* adjoint double layer potential */
#pragma omp for schedule( dynamic, 8 )
#pragma omp for schedule( dynamic, 1 ) nowait
for ( LO i = 0; i < blocks_K.size( ); ++i ) {
block = blocks_K[ i ];
segment_idx = blocks_segment_indices_K[ i ];
......@@ -1973,7 +1973,7 @@ void LocalMultiTraceOperator<LO, SC>::apply_no_sync(
}
// /* single layer potential */
#pragma omp for schedule( dynamic, 8 )
#pragma omp for schedule( dynamic, 1 ) nowait
for ( LO i = 0; i < blocks_V.size( ); ++i ) {
block = blocks_V[ i ];
segment_idx = blocks_segment_indices_V[ i ];
......@@ -2005,7 +2005,7 @@ void LocalMultiTraceOperator<LO, SC>::apply_no_sync(
}
// /* hypersingular potential */
#pragma omp for schedule( dynamic, 8 )
#pragma omp for schedule( dynamic, 1 ) nowait
for ( LO i = 0; i < blocks_D.size( ); ++i ) {
block = blocks_D[ i ];
segment_idx = blocks_segment_indices_D[ i ];
......@@ -2036,13 +2036,16 @@ void LocalMultiTraceOperator<LO, SC>::apply_no_sync(
}
}
if(std::abs(partial_alpha_PI) > 0.0){
LO CHUNKSIZE;
if(std::abs(partial_alpha_PI) > 0.0){
LO n = this->global_row_indices_PI_GLOBAL.size();
LO ri, ci;
SC v;
CHUNKSIZE = (LO) (n / nts) / 1;
if( transA ){
#pragma omp for schedule( dynamic, 8 )
#pragma omp for schedule( dynamic, CHUNKSIZE )
for( LO i = 0; i < n; ++i ){
ri = this->global_row_indices_PI_GLOBAL[ i ];
ci = this->global_col_indices_PI_GLOBAL[ i ];
......@@ -2052,7 +2055,7 @@ void LocalMultiTraceOperator<LO, SC>::apply_no_sync(
}
}
else{
#pragma omp for schedule( dynamic, 8 )
#pragma omp for schedule( dynamic, CHUNKSIZE )
for( LO i = 0; i < n; ++i ){
ri = this->global_row_indices_PI_GLOBAL[ i ];
ci = this->global_col_indices_PI_GLOBAL[ i ];
......@@ -2069,7 +2072,7 @@ void LocalMultiTraceOperator<LO, SC>::apply_no_sync(
SC v;
if( transA ){
#pragma omp for schedule( dynamic, 8 )
#pragma omp for schedule( dynamic, 500 )
for( LO i = 0; i < n; ++i ){
ri = this->global_row_indices_ID_GLOBAL[ i ];
ci = this->global_col_indices_ID_GLOBAL[ i ];
......@@ -2079,7 +2082,7 @@ void LocalMultiTraceOperator<LO, SC>::apply_no_sync(
}
}
else{
#pragma omp for schedule( dynamic, 8 )
#pragma omp for schedule( dynamic, 500 )
for( LO i = 0; i < n; ++i ){
ri = this->global_row_indices_ID_GLOBAL[ i ];
ci = this->global_col_indices_ID_GLOBAL[ i ];
......@@ -2100,7 +2103,7 @@ void LocalMultiTraceOperator<LO, SC>::apply_no_sync(
}
y.add( active_DOFs[ j ], mem_ );
}
}
} // end of parallel region
MeasurementExport::add_to_specific_value( ProgressMonitor::getElapsedTime( ) - reft, "apply", "s", true );
// C - 0.5*PI + a*(I-PI)
......@@ -2985,7 +2988,7 @@ void LocalMultiTraceOperator<LO, SC>::init_comm_data( LO nsegments ) {
}
}
/* sparse */
#pragma omp for schedule( dynamic, 1 ) nowait
#pragma omp for schedule( dynamic, 100 ) nowait
for( LO i = 0; i < segment_indices_PI[ s ].size(); ++i ){
idx__ = segment_indices_PI[ s ][ i ];
this->segmented_thread_sparse_rows[ tid ][ s ].push_back( this->global_row_indices_PI_GLOBAL[ idx__ ] );
......@@ -2995,7 +2998,7 @@ void LocalMultiTraceOperator<LO, SC>::init_comm_data( LO nsegments ) {
this->apply_tmp_openmp[tid].add( this->global_row_indices_PI_GLOBAL[ idx__ ], this->global_values_PI_GLOBAL[ idx__ ] * this->apply_tmp_local_X[tid].get(this->global_col_indices_PI_GLOBAL[ idx__ ]) );
}
#pragma omp for schedule( dynamic, 1 ) nowait
#pragma omp for schedule( dynamic, 100 ) nowait
for( LO i = 0; i < segment_indices_ID[ s ].size(); ++i ){
idx__ = segment_indices_ID[ s ][ i ];
this->segmented_thread_sparse_rows[ tid ][ s ].push_back( this->global_row_indices_ID_GLOBAL[ idx__ ] );
......@@ -3519,7 +3522,7 @@ void LocalMultiTraceOperator<LO, SC>::init_apply_data( ) {
LO ri, ci;
SC v;
#pragma omp for schedule( dynamic, 1 ) nowait
#pragma omp for schedule( dynamic, 100 ) nowait
for( LO i = 0; i < n; ++i ){
ri = this->global_row_indices_PI_GLOBAL[ i ];
ci = this->global_col_indices_PI_GLOBAL[ i ];
......@@ -3538,7 +3541,7 @@ void LocalMultiTraceOperator<LO, SC>::init_apply_data( ) {
LO ri, ci;
SC v;
#pragma omp for schedule( dynamic, 1) nowait
#pragma omp for schedule( dynamic, 100) nowait
for( LO i = 0; i < n; ++i ){
ri = this->global_row_indices_ID_GLOBAL[ i ];
ci = this->global_col_indices_ID_GLOBAL[ i ];
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment