Commit 47a65492 authored by Lukas Maly

Merge branch 'chunksize-500' into 'master'

Chunksize 500

See merge request kernels/bem4i-solver!1
parents 7c24bb30 729b67f6
#!/bin/bash
#PBS -N BEM4I_MTF
#PBS -l select=4:ncpus=24:mpiprocs=4:ompthreads=6
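# 4 nodes x 24 cores each: 4 MPI ranks per node, 6 OpenMP threads per rank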
#PBS -l walltime=04:00:00
#PBS -q qprod
#PBS -A DD-19-15
ml Eigen
ml intel/2018a
ml METIS
echo $PBS_O_WORKDIR
cd $PBS_O_WORKDIR/
pwd
# load extrae module
. /home/mal539/POP/bem4i_kernel_reduction/bem4i_MK/set_env.sh
ml IntelMPI-2018a
export TRACE_NAME=MKbem4iKernal.prv
export OMP_NUM_THREADS=6
export OMP_PROC_BIND=true
export OMP_DYNAMIC=false
export KMP_AFFINITY=verbose,granularity=fine,scatter
export I_MPI_ASYNC_PROGRESS=0
export I_MPI_ASYNC_PROGRESS_THREADS=0
#test_prefix=COMM_
#executable="${HOME_DIR}dist/release_intel/Intel-Linux/bem4i_comm_test"
test_prefix=OVERLAP_
executable="${HOME_DIR}dist/release_intel/Intel-Linux/bem4i_novel"
ex_name=bem4i_novel
#test_prefix=BLOCKING_
#executable="${HOME_DIR}dist/release_intel/Intel-Linux/bem4i_classic"
#executable="${HOME_DIR}dist/release_intel/Intel-Linux/bem4i_classic_chunk_oneperthread_nowait"
#ex_name=bem4i_classic
test_index=1
#REFINEMENT OPTIONS
refine4=4
refine9=1
#TEST PROBLEM OPTIONS
wave_direction=(1 0 1)
alpha=0 # alpha >= 0.0; with alpha=0.0 only the transmission operator is used
problem_idx=0
#ACA OPTIONS
aca_eta=1.0
aca_eps=1e-6
cluster_base_dim=10
block_cluster_max_dim=10000
n_subdomains=2
max_rank=-1
#EVALUATION CUT OPTIONS
eval_boundary_refine=1e-3
eval_refine=6
#SOLVER OPTIONS
solver_eps=1e-6
max_num_iterations=5
gmres_buffer_size=500
#0 - classic, 1 - distributed
gmres_type=0
use_preconditioner=0
nsegments=1
arguments="${refine4} ${refine9} ${wave_direction[0]} ${wave_direction[1]} ${wave_direction[2]} ${aca_eta} ${aca_eps} ${solver_eps} ${cluster_base_dim} ${block_cluster_max_dim} ${n_subdomains} ${eval_boundary_refine} ${eval_refine} ${alpha} ${problem_idx} ${max_num_iterations} -1 ${max_rank} ${gmres_buffer_size} ${gmres_type} ${use_preconditioner} ${nsegments}"
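# total MPI ranks: 4 nodes x 4 mpiprocs per node = 16 (matches the PBS select line)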
mpi=16
echo mpirun -np ${mpi} ${executable} ${arguments}
#mpirun -np ${mpi} ./${executable} ${arguments}
echo mpirun -np ${mpi} trace.sh ${executable} ${arguments}
mpirun -np ${mpi} ./trace.sh ./${executable} ${arguments}
mv ${ex_name}.prv /home/mal539/POP/bem4i_kernel_reduction/bem4i_MK/traces/${ex_name}_ref${refine4}_${refine9}_mpi${mpi}.prv
mv ${ex_name}.row /home/mal539/POP/bem4i_kernel_reduction/bem4i_MK/traces/${ex_name}_ref${refine4}_${refine9}_mpi${mpi}.row
mv ${ex_name}.pcf /home/mal539/POP/bem4i_kernel_reduction/bem4i_MK/traces/${ex_name}_ref${refine4}_${refine9}_mpi${mpi}.pcf
@@ -1295,13 +1295,11 @@ void LocalMultiTraceOperator<LO, SC>::apply(
SC alpha,
SC beta
) {
// MPI_Barrier( MPI_COMM_WORLD );
// /* classic multiplication */
// this->apply_no_sync_optimized( x, y, transA, alpha, beta / this->nprocs );
// version 1 - master
this->apply_no_sync( x, y, transA, alpha, beta / this->nprocs );
double reft_ = MPI_Wtime();
// MPI_Barrier( MPI_COMM_WORLD );
double reft = MPI_Wtime();
MPI_Allreduce( MPI_IN_PLACE, y.getData( ), y.getLength( ), GetType<LO, SC>::MPI_SC( ), MPI_SUM, MPI_COMM_WORLD );
double time_to_gather = MPI_Wtime() - reft;
@@ -1310,13 +1308,10 @@ void LocalMultiTraceOperator<LO, SC>::apply(
MeasurementExport::add_to_specific_value( time_to_gather, "vector [MPI Synchronization]", "s" );
// // non-blocking synchronization
// if( transA ){
// throw runtime_error( "Error: LocalMultiTraceOperator::apply( ... ), non-blocking synchronization scheme does not support transposed apply yet\n" );
// }
// else{
// this->apply_nonblocking( x, y, false, alpha, beta / this->nprocs );
// }
// version 2 - non-blocking + automatically correct chunk-size
// non-blocking synchronization
// this->apply_nonblocking( x, y, false, alpha, beta / this->nprocs );
}
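
For reference, the synchronization kept active above ("version 1 - master") combines each rank's partial result with a blocking in-place MPI_Allreduce and brackets the call with MPI_Wtime. A minimal standalone sketch of that pattern, using a plain std::vector<double> and a placeholder local kernel instead of the BEM4I vector class and apply_no_sync:

```cpp
// Standalone sketch only (not the BEM4I implementation): every rank owns a
// partial result y, a blocking in-place MPI_Allreduce sums the contributions,
// and MPI_Wtime brackets the call, matching "version 1" above.
#include <mpi.h>
#include <vector>
#include <cstdio>

int main( int argc, char** argv ) {
  MPI_Init( &argc, &argv );
  int rank, nprocs;
  MPI_Comm_rank( MPI_COMM_WORLD, &rank );
  MPI_Comm_size( MPI_COMM_WORLD, &nprocs );

  const int n = 1000;
  std::vector<double> y( n );
  // placeholder for the rank-local part of the matrix-vector product
  for ( int i = 0; i < n; ++i ) y[ i ] = static_cast<double>( rank + 1 );

  double reft = MPI_Wtime( );
  MPI_Allreduce( MPI_IN_PLACE, y.data( ), n, MPI_DOUBLE, MPI_SUM,
      MPI_COMM_WORLD );
  double time_to_gather = MPI_Wtime( ) - reft;

  if ( rank == 0 )
    std::printf( "y[ 0 ] = %g, reduction took %g s on %d ranks\n",
        y[ 0 ], time_to_gather, nprocs );

  MPI_Finalize( );
  return 0;
}
```

The commented-out "version 2" path (apply_nonblocking) would instead overlap this reduction with the remaining local work.
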
template<class LO, class SC>
@@ -1440,7 +1435,7 @@ void LocalMultiTraceOperator<LO, SC>::apply_nonblocking(
}
}
if( tid == 0 ){
/* if( tid == 0 ){
for( LO j = 0; j < si - 1; ++j ){
if( this->comm_flags[ this->segmented_order[ j ] ] ){
@@ -1450,7 +1445,7 @@ void LocalMultiTraceOperator<LO, SC>::apply_nonblocking(
MPI_Test( &this->comm_requests[ this->segmented_order[ j ] ], &this->comm_flags[ this->segmented_order[ j ] ], MPI_STATUS_IGNORE );
}
}
}*/
}
/* sparse */
for( LO j = 0; j < segmented_thread_sparse_rows[ tid ][ s ].size(); ++j ){
@@ -1908,7 +1903,7 @@ void LocalMultiTraceOperator<LO, SC>::apply_no_sync(
/* double layer potential */
#pragma omp for schedule( dynamic, 8 )
#pragma omp for schedule( dynamic, 1 )
for ( LO i = 0; i < blocks_K.size( ); ++i ) {
block = blocks_K[ i ];
segment_idx = blocks_segment_indices_K[ i ];
@@ -1939,9 +1934,9 @@ void LocalMultiTraceOperator<LO, SC>::apply_no_sync(
}
}
#pragma omp barrier
// #pragma omp barrier
/* adjoint double layer potential */
#pragma omp for schedule( dynamic, 8 )
#pragma omp for schedule( dynamic, 1 ) nowait
for ( LO i = 0; i < blocks_K.size( ); ++i ) {
block = blocks_K[ i ];
segment_idx = blocks_segment_indices_K[ i ];
@@ -1973,7 +1968,7 @@ void LocalMultiTraceOperator<LO, SC>::apply_no_sync(
}
// /* single layer potential */
#pragma omp for schedule( dynamic, 8 )
#pragma omp for schedule( dynamic, 1 ) nowait
for ( LO i = 0; i < blocks_V.size( ); ++i ) {
block = blocks_V[ i ];
segment_idx = blocks_segment_indices_V[ i ];
@@ -2005,7 +2000,7 @@ void LocalMultiTraceOperator<LO, SC>::apply_no_sync(
}
// /* hypersingular potential */
#pragma omp for schedule( dynamic, 8 )
#pragma omp for schedule( dynamic, 1 ) nowait
for ( LO i = 0; i < blocks_D.size( ); ++i ) {
block = blocks_D[ i ];
segment_idx = blocks_segment_indices_D[ i ];
@@ -2036,13 +2031,18 @@ void LocalMultiTraceOperator<LO, SC>::apply_no_sync(
}
}
if(std::abs(partial_alpha_PI) > 0.0){
LO CHUNKSIZE;
if(std::abs(partial_alpha_PI) > 0.0){
LO n = this->global_row_indices_PI_GLOBAL.size();
LO ri, ci;
SC v;
// define chunk-size
CHUNKSIZE = (LO) (n / nts) / 1;
CHUNKSIZE = 500;
if( transA ){
#pragma omp for schedule( dynamic, 8 )
#pragma omp for schedule( dynamic, CHUNKSIZE )
for( LO i = 0; i < n; ++i ){
ri = this->global_row_indices_PI_GLOBAL[ i ];
ci = this->global_col_indices_PI_GLOBAL[ i ];
@@ -2052,7 +2052,7 @@ void LocalMultiTraceOperator<LO, SC>::apply_no_sync(
}
}
else{
#pragma omp for schedule( dynamic, 8 )
#pragma omp for schedule( dynamic, CHUNKSIZE )
for( LO i = 0; i < n; ++i ){
ri = this->global_row_indices_PI_GLOBAL[ i ];
ci = this->global_col_indices_PI_GLOBAL[ i ];
@@ -2069,7 +2069,7 @@ void LocalMultiTraceOperator<LO, SC>::apply_no_sync(
SC v;
if( transA ){
#pragma omp for schedule( dynamic, 8 )
#pragma omp for schedule( dynamic, CHUNKSIZE )
for( LO i = 0; i < n; ++i ){
ri = this->global_row_indices_ID_GLOBAL[ i ];
ci = this->global_col_indices_ID_GLOBAL[ i ];
@@ -2079,7 +2079,7 @@ void LocalMultiTraceOperator<LO, SC>::apply_no_sync(
}
}
else{
#pragma omp for schedule( dynamic, 8 )
#pragma omp for schedule( dynamic, CHUNKSIZE )
for( LO i = 0; i < n; ++i ){
ri = this->global_row_indices_ID_GLOBAL[ i ];
ci = this->global_col_indices_ID_GLOBAL[ i ];
@@ -2100,16 +2100,9 @@ void LocalMultiTraceOperator<LO, SC>::apply_no_sync(
}
y.add( active_DOFs[ j ], mem_ );
}
}
} // end of parallel region
MeasurementExport::add_to_specific_value( ProgressMonitor::getElapsedTime( ) - reft, "apply", "s", true );
// C - 0.5*PI + a*(I-PI)
// if(std::abs(partial_alpha_PI) > 0.0){
// this->PI_GLOBAL.apply( x, y, transA, partial_alpha_PI * alpha, 1.0 );
// }
// if(std::abs(partial_alpha_I) > 0.0){
// this->ID_GLOBAL.apply( x, y, transA, partial_alpha_I * alpha, 1.0 );
// }
}
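
The recurring change in apply_no_sync above is the chunk size of the dynamic OpenMP schedules: the dense block loops go to schedule( dynamic, 1 ), and the sparse PI/ID loops get a fixed CHUNKSIZE = 500 in place of the computed (n / nts) / 1. A minimal sketch, with placeholder triplet arrays instead of the *_GLOBAL members, showing that the schedule clause accepts a runtime chunk size, so the same pragma covers both variants:

```cpp
// Standalone sketch only: the OpenMP schedule clause takes a runtime chunk
// size, so a fixed value (500, as introduced by this merge) can replace the
// previously computed n / <thread count>. The triplet arrays are placeholders,
// not the BEM4I *_GLOBAL members.
#include <omp.h>
#include <vector>
#include <cstdio>

int main( ) {
  const int n = 100000;
  std::vector<double> values( n, 1.0 ), x( n, 2.0 ), y( n, 0.0 );
  std::vector<int> rows( n ), cols( n );
  for ( int i = 0; i < n; ++i ) { rows[ i ] = i; cols[ i ] = i; }

  int chunksize = 500;                           // fixed chunk (this merge)
  // int chunksize = n / omp_get_max_threads( ); // computed variant it replaces

  #pragma omp parallel
  {
    #pragma omp for schedule( dynamic, chunksize ) nowait
    for ( int i = 0; i < n; ++i ) {
      // rows are distinct in this toy setup, so the update is race-free
      y[ rows[ i ] ] += values[ i ] * x[ cols[ i ] ];
    }
  }

  std::printf( "y[ 0 ] = %g\n", y[ 0 ] );
  return 0;
}
```

A fixed chunk keeps the overhead of many small dynamic chunks down while still letting threads balance the sparse work; the best value is workload-dependent, which is what this merge tunes.
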
template<class LO, class SC>
@@ -2985,7 +2978,7 @@ void LocalMultiTraceOperator<LO, SC>::init_comm_data( LO nsegments ) {
}
}
/* sparse */
#pragma omp for schedule( dynamic, 1 ) nowait
#pragma omp for schedule( dynamic, 100 ) nowait
for( LO i = 0; i < segment_indices_PI[ s ].size(); ++i ){
idx__ = segment_indices_PI[ s ][ i ];
this->segmented_thread_sparse_rows[ tid ][ s ].push_back( this->global_row_indices_PI_GLOBAL[ idx__ ] );
@@ -2995,7 +2988,7 @@ void LocalMultiTraceOperator<LO, SC>::init_comm_data( LO nsegments ) {
this->apply_tmp_openmp[tid].add( this->global_row_indices_PI_GLOBAL[ idx__ ], this->global_values_PI_GLOBAL[ idx__ ] * this->apply_tmp_local_X[tid].get(this->global_col_indices_PI_GLOBAL[ idx__ ]) );
}
#pragma omp for schedule( dynamic, 1 ) nowait
#pragma omp for schedule( dynamic, 100 ) nowait
for( LO i = 0; i < segment_indices_ID[ s ].size(); ++i ){
idx__ = segment_indices_ID[ s ][ i ];
this->segmented_thread_sparse_rows[ tid ][ s ].push_back( this->global_row_indices_ID_GLOBAL[ idx__ ] );
@@ -3519,7 +3512,7 @@ void LocalMultiTraceOperator<LO, SC>::init_apply_data( ) {
LO ri, ci;
SC v;
#pragma omp for schedule( dynamic, 1 ) nowait
#pragma omp for schedule( dynamic, 100 ) nowait
for( LO i = 0; i < n; ++i ){
ri = this->global_row_indices_PI_GLOBAL[ i ];
ci = this->global_col_indices_PI_GLOBAL[ i ];
@@ -3538,7 +3531,7 @@ void LocalMultiTraceOperator<LO, SC>::init_apply_data( ) {
LO ri, ci;
SC v;
#pragma omp for schedule( dynamic, 1) nowait
#pragma omp for schedule( dynamic, 100) nowait
for( LO i = 0; i < n; ++i ){
ri = this->global_row_indices_ID_GLOBAL[ i ];
ci = this->global_col_indices_ID_GLOBAL[ i ];
......