Commit 2d0a6aab authored by mal539

TODO: first attempt at apply_nonblocking_sharedLoop.

parent c51a58cb
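The change interleaves per-segment OpenMP work with non-blocking MPI reductions of the corresponding slices of y, waiting for all outstanding reductions only once at the end. A minimal standalone sketch of that overlap pattern, independent of the class internals (compute_segment, comm_start and the plain double buffer are illustrative stand-ins, not the actual members used below), could look like this:

#include <mpi.h>
#include <vector>

// stand-in for the per-segment OpenMP block/sparse work
static void compute_segment( std::vector<double> & y, int begin, int end ) {
  for ( int i = begin; i < end; ++i ) y[ i ] += 1.0;
}

// per-segment compute followed immediately by a non-blocking in-place reduction;
// the reduction of segment s overlaps with the computation of segment s+1
static void segmented_allreduce( std::vector<double> & y,
    const std::vector<int> & comm_start ) {
  const int nsegments = static_cast<int>( comm_start.size( ) ) - 1;
  std::vector<MPI_Request> requests( nsegments );
  for ( int s = 0; s < nsegments; ++s ) {
    compute_segment( y, comm_start[ s ], comm_start[ s + 1 ] );
    MPI_Iallreduce( MPI_IN_PLACE, y.data( ) + comm_start[ s ],
        comm_start[ s + 1 ] - comm_start[ s ], MPI_DOUBLE, MPI_SUM,
        MPI_COMM_WORLD, &requests[ s ] );
  }
  // collect all outstanding per-segment reductions at once
  MPI_Waitall( nsegments, requests.data( ), MPI_STATUSES_IGNORE );
}

apply_nonblocking_sharedLoop below follows this pattern, with the per-segment work shared across one OpenMP team and the requests stored in comm_requests.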
@@ -72,7 +72,7 @@ gmres_buffer_size=500
gmres_type=0
use_preconditioner=0
nsegments=1
nsegments=4
arguments="${refine4} ${refine9} ${wave_direction[0]} ${wave_direction[1]} ${wave_direction[2]} ${aca_eta} ${aca_eps} ${solver_eps} ${cluster_base_dim} ${block_cluster_max_dim} ${n_subdomains} ${eval_boundary_refine} ${eval_refine} ${alpha} ${problem_idx} ${max_num_iterations} -1 ${max_rank} ${gmres_buffer_size} ${gmres_type} ${use_preconditioner} ${nsegments}"
...
@@ -432,6 +432,8 @@ public:
return true;
}
std::cout << "maxIt = " << maxIt << ", restarts = " << m << std::endl;
// initialize vectors and variables
FullMatrix<LO, SC> V( n, m + 1 );
FullMatrix<LO, SC> H( m + 1, m );
...
@@ -1298,7 +1298,7 @@ void LocalMultiTraceOperator<LO, SC>::apply(
// MPI_Barrier( MPI_COMM_WORLD );
// /* classic multipplication */
// this->apply_no_sync_optimized( x, y, transA, alpha, beta / this->nprocs );
this->apply_no_sync( x, y, transA, alpha, beta / this->nprocs );
/* this->apply_no_sync( x, y, transA, alpha, beta / this->nprocs );
double reft_ = MPI_Wtime();
// MPI_Barrier( MPI_COMM_WORLD );
@@ -1309,16 +1309,262 @@ void LocalMultiTraceOperator<LO, SC>::apply(
MeasurementExport::add_to_specific_value( reft - reft_, "vector [barrier]", "s" );
MeasurementExport::add_to_specific_value( time_to_gather, "vector [MPI Synchronization]", "s" );
*/
// // non-blocking synchronization
// if( transA ){
// throw runtime_error( "Error: LocalMultiTraceOperator::apply( ... ), non-blocking synchronization scheme does not support transposed apply yet\n" );
// }
// else{
// this->apply_nonblocking( x, y, false, alpha, beta / this->nprocs );
this->apply_nonblocking( x, y, false, alpha, beta / this->nprocs );
//this->apply_nonblocking_sharedLoop( x, y, false, alpha, beta / this->nprocs );
// }
}
template<class LO, class SC>
void LocalMultiTraceOperator<LO, SC>::apply_nonblocking_sharedLoop(
Vector< LO, SC > const & x,
Vector< LO, SC > & y,
bool transA,
SC alpha,
SC beta
) {
double reft = ProgressMonitor::getElapsedTime( );
LO n = y.getLength( );
const int nsegments = this->segmented_order.size( );
std::fill( this->comm_flags.begin(), this->comm_flags.end(), 0 );
/* sparse */
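// bucket every global sparse entry (PI and ID parts) into the segment that owns its row, so the sparse work can be done inside the per-segment loop below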
std::vector<std::vector<LO>> segment_indices_PI( nsegments );
std::vector<std::vector<LO>> segment_indices_ID( nsegments );
SC partial_alpha_PI = ( -0.5 - this->PI_alpha );
SC partial_alpha_I = this->PI_alpha;
if(std::abs(partial_alpha_PI) > 0.0){
LO ri;
for( LO i = 0; i < this->global_row_indices_PI_GLOBAL.size(); ++i ){
ri = this->global_row_indices_PI_GLOBAL[ i ];
segment_indices_PI[ ri / seglen ].push_back( i );
}
}
if(std::abs(partial_alpha_I) > 0.0){
LO ri;
for( LO i = 0; i < this->global_row_indices_ID_GLOBAL.size(); ++i ){
ri = this->global_row_indices_ID_GLOBAL[ i ];
segment_indices_ID[ ri / seglen ].push_back( i );
}
}
#pragma omp parallel
{
Matrix<LO, SC> *block;
SC local_alpha;
LO segment_idx;
std::vector<LO> *innerDOFs_;
std::vector<LO> *outerDOFs_;
LO DOF_shift_inner_;
LO DOF_shift_outer_;
LO tid = omp_get_thread_num( );
LO nts = omp_get_num_threads( );
LO block_type, block_index; // only consumed by the commented-out fallback loop below
LO idx_;
for( LO i = tid; i < n; i += nts ){
y.scale( i, beta );
}
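// zero the per-thread accumulation buffer for every DOF this rank contributes to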
for( LO j = 0; j < this->active_DOFs.size( ); ++j ){
idx_ = this->active_DOFs[ j ];
this->apply_tmp_openmp[ tid ].set( idx_, 0.0 );
}
LO idx__;
#pragma omp barrier
/* process the segments in the prescribed order; each segment is accumulated into y and handed to a non-blocking reduction before the loop moves on */
for( LO si = 0; si < nsegments; ++si ){
LO s = this->segmented_order[ si ];
/* K blocks: apply each block and its transpose, scattering results into the thread-local accumulator */
#pragma omp for schedule( dynamic, 1 ) nowait
for( LO bi = 0; bi < segment_block_indices_K[ s ].size(); ++bi ){
LO K_block_idx = segment_block_indices_K[ s ][ bi ];
block = this->blocks_K[ K_block_idx ];
segment_idx = this->blocks_segment_indices_K[ K_block_idx ];
innerDOFs_ = this->blocks_inner_DOFs_K[ K_block_idx ];
outerDOFs_ = this->blocks_outer_DOFs_K[ K_block_idx ];
DOF_shift_inner_ = this->segment_DOF_shift_inner[ segment_idx ];
DOF_shift_outer_ = this->segment_DOF_shift_outer[ segment_idx ];
this->apply_tmp_local_X[tid].setLength( innerDOFs_->size( ) );
this->apply_tmp_local_Y[tid].setLength( outerDOFs_->size( ) );
for ( LO j = 0; j < innerDOFs_->size( ); ++j ) {
this->apply_tmp_local_X[tid].set( j, this->memx.get( innerDOFs_->at( j ) + DOF_shift_inner_ ) );
}
block->apply( this->apply_tmp_local_X[tid], this->apply_tmp_local_Y[tid], false, 1.0, 0.0 );
for ( LO j = 0; j < outerDOFs_->size( ); ++j ) {
this->apply_tmp_openmp[tid].add( outerDOFs_->at( j ) + DOF_shift_outer_, this->apply_tmp_local_Y[tid].get( j ) );
}
this->apply_tmp_local_X[tid].setLength( outerDOFs_->size( ) );
this->apply_tmp_local_Y[tid].setLength( innerDOFs_->size( ) );
for ( LO j = 0; j < outerDOFs_->size( ); ++j ) {
this->apply_tmp_local_X[tid].set( j, this->memx.get( outerDOFs_->at( j ) + DOF_shift_outer_ ) );
}
block->apply( this->apply_tmp_local_X[tid], this->apply_tmp_local_Y[tid], true, 1.0, 0.0 );
for ( LO j = 0; j < innerDOFs_->size( ); ++j ) {
this->apply_tmp_openmp[tid].add( innerDOFs_->at( j ) + DOF_shift_inner_, this->apply_tmp_local_Y[tid].get( j ) );
}
/* if( tid == 0 ){
for( LO j = 0; j < si; ++j ){
if( this->comm_flags[ this->segmented_order[ j ] ] ){
continue;
}
MPI_Test( &this->comm_requests[ this->segmented_order[ j ] ], &this->comm_flags[ this->segmented_order[ j ] ], MPI_STATUS_IGNORE );
}
}*/
}
/* V blocks: apply each block once into the thread-local accumulator */
#pragma omp for schedule( dynamic, 1 ) nowait
for( LO bi = 0; bi < segment_block_indices_V[ s ].size(); ++bi ){
LO V_block_idx = segment_block_indices_V[ s ][ bi ];
block = this->blocks_V[ V_block_idx ];
segment_idx = this->blocks_segment_indices_V[ V_block_idx ];
innerDOFs_ = this->blocks_inner_DOFs_V[ V_block_idx ];
outerDOFs_ = this->blocks_outer_DOFs_V[ V_block_idx ];
DOF_shift_inner_ = this->segment_DOF_shift_inner[ segment_idx ];
DOF_shift_outer_ = this->segment_DOF_shift_outer[ segment_idx ];
this->apply_tmp_local_X[tid].setLength( innerDOFs_->size( ) );
this->apply_tmp_local_Y[tid].setLength( outerDOFs_->size( ) );
for ( LO j = 0; j < innerDOFs_->size( ); ++j ) {
this->apply_tmp_local_X[tid].set( j, this->memx.get( innerDOFs_->at( j ) + DOF_shift_inner_ ) );
}
block->apply( this->apply_tmp_local_X[tid], this->apply_tmp_local_Y[tid], false, 1.0, 0.0 );
for ( LO j = 0; j < outerDOFs_->size( ); ++j ) {
this->apply_tmp_openmp[tid].add( outerDOFs_->at( j ) + DOF_shift_outer_, this->apply_tmp_local_Y[tid].get( j ) );
}
/* if( tid == 0 ){
for( LO j = 0; j < si; ++j ){
if( this->comm_flags[ this->segmented_order[ j ] ] ){
continue;
}
MPI_Test( &this->comm_requests[ this->segmented_order[ j ] ], &this->comm_flags[ this->segmented_order[ j ] ], MPI_STATUS_IGNORE );
}
}*/
}
/* D blocks (hypersingular part): apply each block once into the thread-local accumulator */
#pragma omp for schedule( dynamic, 1 ) nowait
for( LO bi = 0; bi < segment_block_indices_D[ s ].size(); ++bi ){
LO D_block_idx = segment_block_indices_D[ s ][ bi ];
// bookkeeping consumed only by the commented-out per-thread fallback loop below
this->segmented_thread_block_type_indices[ tid ][ s ].push_back( 2 );
this->segmented_thread_block_indices[ tid ][ s ].push_back( D_block_idx );
block = this->blocks_D[ D_block_idx ];
segment_idx = this->blocks_segment_indices_D[ D_block_idx ];
innerDOFs_ = this->blocks_inner_DOFs_D[ D_block_idx ];
outerDOFs_ = this->blocks_outer_DOFs_D[ D_block_idx ];
DOF_shift_inner_ = this->segment_DOF_shift_inner[ segment_idx ];
DOF_shift_outer_ = this->segment_DOF_shift_outer[ segment_idx ];
this->apply_tmp_local_X[tid].setLength( innerDOFs_->size( ) );
this->apply_tmp_local_Y[tid].setLength( outerDOFs_->size( ) );
for ( LO j = 0; j < innerDOFs_->size( ); ++j ) {
this->apply_tmp_local_X[tid].set( j, this->memx.get( innerDOFs_->at( j ) + DOF_shift_inner_ ) );
}
block->apply( this->apply_tmp_local_X[tid], this->apply_tmp_local_Y[tid], false, 1.0, 0.0 );
for ( LO j = 0; j < outerDOFs_->size( ); ++j ) {
this->apply_tmp_openmp[tid].add( outerDOFs_->at( j ) + DOF_shift_outer_, this->apply_tmp_local_Y[tid].get( j ) );
}
/* if( tid == 0 ){
for( LO j = 0; j < si; ++j ){
if( this->comm_flags[ this->segmented_order[ j ] ] ){
continue;
}
MPI_Test( &this->comm_requests[ this->segmented_order[ j ] ], &this->comm_flags[ this->segmented_order[ j ] ], MPI_STATUS_IGNORE );
}
} */
}
/* sparse: add this segment's PI and ID contributions */
#pragma omp for schedule( dynamic, 100 ) nowait
for( LO i = 0; i < segment_indices_PI[ s ].size(); ++i ){
idx__ = segment_indices_PI[ s ][ i ];
// read the input from the gathered vector memx (global column index), as in the dense block loops above
this->apply_tmp_openmp[tid].add( this->global_row_indices_PI_GLOBAL[ idx__ ], this->global_values_PI_GLOBAL[ idx__ ] * this->memx.get( this->global_col_indices_PI_GLOBAL[ idx__ ] ) );
}
#pragma omp for schedule( dynamic, 100 ) nowait
for( LO i = 0; i < segment_indices_ID[ s ].size(); ++i ){
idx__ = segment_indices_ID[ s ][ i ];
this->apply_tmp_openmp[tid].add( this->global_row_indices_ID_GLOBAL[ idx__ ], this->global_values_ID_GLOBAL[ idx__ ] * this->memx.get( this->global_col_indices_ID_GLOBAL[ idx__ ] ) );
}
/*
for( LO i = 0; i < this->segmented_thread_block_type_indices[ tid ][ s ].size(); ++i ){
block_type = this->segmented_thread_block_type_indices[ tid ][ s ][ i ];
block_index = this->segmented_thread_block_indices[ tid ][ s ][ i ];
*/
SC mem_;
#pragma omp barrier
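// all threads have finished this segment; sum the per-thread partial results into y for the segment's active DOFs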
for( LO j = tid; j < this->segmented_active_DOFs[ s ].size( ); j += nts ){
mem_ = 0.0;
idx_ = this->segmented_active_DOFs[ s ][ j ];
for( LO t = 0; t < nts; ++t ){
mem_ += this->apply_tmp_openmp[ t ].get( idx_ );
}
y.add( idx_, mem_ );
}
#pragma omp barrier
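// thread 0 starts a non-blocking reduction of this segment of y; the team continues with the next segment while it completes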
if( tid == 0 )
{
MPI_Iallreduce( MPI_IN_PLACE, y.getData() + this->comm_start[ s ], this->comm_start[ s + 1 ] - this->comm_start[ s ], GetType<LO, SC>::MPI_SC( ), MPI_SUM, MPI_COMM_WORLD, &this->comm_requests[ s ] );
}
}//end of segment iterations
}//end of pragma omp region
MeasurementExport::add_to_specific_value( ProgressMonitor::getElapsedTime( ) - reft, "apply", "s", true );
double reft_ = ProgressMonitor::getElapsedTime( );
//MPI_Barrier( MPI_COMM_WORLD );
reft = ProgressMonitor::getElapsedTime( );
MeasurementExport::add_to_specific_value( reft - reft_, "vector [barrier]", "s" );
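// wait for all per-segment reductions started inside the parallel region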
MPI_Waitall( nsegments, this->comm_requests, MPI_STATUSES_IGNORE );
// MPI_Allreduce( MPI_IN_PLACE, y.getData( ), y.getLength( ), GetType<LO, SC>::MPI_SC( ), MPI_SUM, MPI_COMM_WORLD );
double tf = ProgressMonitor::getElapsedTime( );
MeasurementExport::add_to_specific_value( tf - reft, "vector [MPI Synchronization]", "s" );
}
template<class LO, class SC>
void LocalMultiTraceOperator<LO, SC>::apply_nonblocking(
Vector< LO, SC > const & x,
@@ -2005,7 +2251,7 @@ void LocalMultiTraceOperator<LO, SC>::apply_no_sync(
}
// /* hypersingular potential */
#pragma omp for schedule( dynamic, 1 ) nowait
#pragma omp for schedule( dynamic, 1 )
for ( LO i = 0; i < blocks_D.size( ); ++i ) {
block = blocks_D[ i ];
segment_idx = blocks_segment_indices_D[ i ];
...
@@ -132,6 +132,14 @@ public:
SC beta = 0.0
);
virtual void apply_nonblocking_sharedLoop(
Vector< LO, SC > const & x,
Vector< LO, SC > & y,
bool transA = false,
SC alpha = 1.0,
SC beta = 0.0
);
virtual void apply(
MPIDistributedVector< LO, SC > & x,
MPIDistributedVector< LO, SC > & y,
...