Commit 04a94419 authored by mal539's avatar mal539
Browse files

INIT: bem4i [master] initial commit.

parents
/build/
/dist/
tests/
/traces/
/set-0/
output/
*.zip
# Keep only every third line (lines 3, 6, 9, ...) of decompositions.txt.
awk 'NR%3==0' decompositions.txt > decomp01.txt
# Replace each space with ", ", then drop a ", " left dangling at the end of a line.
sed 's/ /, /g' decomp01.txt | sed 's/, $//g' > decomp02.txt
# Wrap every line in braces and append a comma, i.e. "a, b" becomes "{a, b},".
awk '{print "{"$0"},"}' decomp02.txt > decomp03.txt
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This diff is collapsed.
3
3
8
1.0 -1.0 -1.0
1.0 1.0 -1.0
-1.0 1.0 -1.0
-1.0 -1.0 -1.0
1.0 -1.0 1.0
1.0 1.0 1.0
-1.0 1.0 1.0
-1.0 -1.0 1.0
12
0 1 5
0 5 4
1 2 6
1 6 5
2 7 6
2 3 7
3 0 4
3 4 7
0 3 1
2 1 3
4 5 6
4 6 7
This diff is collapsed.
This diff is collapsed.
#!/bin/sh
# Build script: loads the required modules and rebuilds the release_intel
# configuration from scratch (make clean followed by make).
# NOTE(review): `ml` is assumed to be the module-loading shorthand provided by
# the cluster environment - confirm it is available in non-interactive shells.
ml Eigen
ml intel/2018a
#ml intel/2017c
#ml intel
#ml intel
#ml imkl
ml METIS
#ml TotalView
#ml Score-P/4.1-intel-2018a
#ml Score-P/5.0-intel-2018a
#ml TotalView
#ml Score-P
#ml GCCcore/8.3.0
#ml GCC/8.3.0-2.32
#ml binutils/2.32-GCCcore-8.3.0
#ml ifort/2019.4.227-GCC-8.3.0-2.32
#ml icc/2019.4.227-GCC-8.3.0-2.32
#ml iccifort/2019.4.227-GCC-8.3.0-2.32
#ml intel/2019.03-GCC-8.3.0-2.32
#ml impi/2018.4.274-iccifort-2018.5.274-GCC-6.3.0-2.27
#ml imkl/2019.3.199-iimpi-2019.03-GCC-8.3.0-2.32
#ml iimpi/2017a
#ml imkl/2018.4.274-iompi-2018.04
# Remove the previous release_intel build products, then build again.
make clean CONF=release_intel
make CONF=release_intel
# Debug configuration (disabled); enable instead of the two lines above if needed.
#make clean CONF=debug_intel
#make CONF=debug_intel
#!/bin/bash
# Local test run of bem4i: 4 MPI ranks, each executed under Valgrind.
# bash is required: wave_direction below is an array, which POSIX sh does not
# provide, so the script must not be interpreted by a plain /bin/sh (e.g. dash).
ml Eigen
ml intel/2018a
#ml intel/2017c
#ml intel
#ml intel
#ml imkl
ml METIS
#ml VTune/2018_update4
#ml TotalView
#ml Score-P/4.1-intel-2018a
#ml Score-P/5.0-intel-2018a
ml Valgrind/3.15.0-intel-2018.04
#ml GCCcore/8.3.0
#ml GCC/8.3.0-2.32
#ml binutils/2.32-GCCcore-8.3.0
#ml ifort/2019.4.227-GCC-8.3.0-2.32
#ml icc/2019.4.227-GCC-8.3.0-2.32
#ml iccifort/2019.4.227-GCC-8.3.0-2.32
#ml intel/2019.03-GCC-8.3.0-2.32
#ml impi/2019.3.199-iccifort-2019.4.227-GCC-8.3.0-2.32
#ml imkl/2019.3.199-iimpi-2019.03-GCC-8.3.0-2.32
#ml impi/2018.4.274-iccifort-2018.5.274-GCC-6.3.0-2.27
#ml Forge/18.2.3
#ml Score-P/5.0-intel-2018a
#ml Scalasca/2.5-intel-2018a
#export SCOREP_EXPERIMENT_DIRECTORY=Score-P/scorep_sum
#export SCOREP_TOTAL_MEMORY=128384000
#export SCOREP_PROFILING_MAX_CALLPATH_DEPTH=87
#export SCOREP_PROFILING_ENABLE_CORE_FILES=1
#cp ${HOME_DIR}dist/release_intel/Intel-Linux/bem4i ${HOME_DIR}dist/release_intel/Intel-Linux/bem4i_comm_test
# Executable selection. Several variants are listed; the LAST uncommented
# test_prefix/executable pair wins (currently BLOCKING_ / bem4i_classic).
# NOTE(review): HOME_DIR is not set in this script; the run command prepends
# "./", so it is presumably empty or a relative prefix - confirm.
#test_prefix=COMM_
#executable="${HOME_DIR}dist/release_intel/Intel-Linux/bem4i_comm_test"
test_prefix=OVERLAP_
executable="${HOME_DIR}dist/release_intel/Intel-Linux/bem4i_novel"
#executable="${HOME_DIR}dist/release_intel_scorep/Intel-Linux/bem4i"
test_prefix=BLOCKING_
executable="${HOME_DIR}dist/release_intel/Intel-Linux/bem4i_classic"
test_index=1
#REFINEMENT OPTIONS
refine4=4
refine9=1
#TEST PROBLEM OPTIONS
wave_direction=(1 0 1)
alpha=0 # alpha >= 0.0, when alpha=0.0 then only the transmission operator is used
problem_idx=0
#ACA OPTIONS
aca_eta=1.0
aca_eps=1e-6
cluster_base_dim=10
block_cluster_max_dim=10000
n_subdomains=2
max_rank=-1
#EVALUATION CUT OPTIONS
eval_boundary_refine=1e-3
eval_refine=6
#SOLVER OPTIONS
solver_eps=1e-6
max_num_iterations=5
gmres_buffer_size=500
#0 - classic, 1 - distributed
gmres_type=0
use_preconditioner=0
nsegments=4
# Positional command line of the executable; the order of the values matters.
arguments="${refine4} ${refine9} ${wave_direction[0]} ${wave_direction[1]} ${wave_direction[2]} ${aca_eta} ${aca_eps} ${solver_eps} ${cluster_base_dim} ${block_cluster_max_dim} ${n_subdomains} ${eval_boundary_refine} ${eval_refine} ${alpha} ${problem_idx} ${max_num_iterations} -1 ${max_rank} ${gmres_buffer_size} ${gmres_type} ${use_preconditioner} ${nsegments}"
export KMP_AFFINITY=verbose,granularity=fine,scatter
export OMP_PROC_BIND=true
export OMP_DYNAMIC=false
export OMP_WAIT_POLICY=active
export OMP_NUM_THREADS=6
export I_MPI_ASYNC_PROGRESS=0
export I_MPI_ASYNC_PROGRESS_THREADS=0
##export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/mal539/POP/bem4i_kernel_reduction/bem4i_MK/Score-P
##echo $LD_LIBRARY_PATH
#export SCOREP_ENABLE_PROFILING=true
#export SCOREP_ENABLE_TRACING=false
#export SCOREP_EXPERIMENT_DIRECTORY=profile
# Log exactly the command that is executed below.
echo mpirun -np 4 valgrind --track-origins=yes ./${executable} ${arguments}
#mpirun -np 4 ${executable} ${arguments} > /home/kra568/__BEM4I/mereni/MTF/OVERLAP_1_P12_MPI002_N00002_MS10_BS10000_ACAEPS1e-6_SOLVERT0_PREC1.txt
mpirun -np 4 valgrind --track-origins=yes ./${executable} ${arguments}
#mpirun -np 2 -env LD_PRELOAD $EBROOTVALGRIND/lib/valgrind/libmpiwrap-amd64-linux.so valgrind ./dist/release_intel/Intel-Linux/bem4i ${arguments}
#!/bin/bash
#PBS -N BEM4I_MTF
#PBS -l select=4:ncpus=24:mpiprocs=4:ompthreads=6
#PBS -l walltime=04:00:00
#PBS -q qprod
#PBS -A DD-19-15
# Batch job: runs bem4i through trace.sh on ${mpi} MPI ranks and moves the
# resulting .prv/.row/.pcf trace files into the traces/omp_schedule directory.
# bash is required: wave_direction below is an array, which POSIX sh does not
# provide.
ml Eigen
ml intel/2018a
ml METIS
echo $PBS_O_WORKDIR
# Abort when the submission directory is unavailable; otherwise the job would
# run (and try to move trace files) from the wrong directory.
cd "$PBS_O_WORKDIR/" || exit 1
pwd
# load extrae module
. /home/mal539/POP/bem4i_kernel_reduction/bem4i_MK/set_env.sh
ml IntelMPI-2018a
export TRACE_NAME=MKbem4iKernal.prv
export OMP_NUM_THREADS=6
export OMP_PROC_BIND=true
export OMP_DYNAMIC=false
export KMP_AFFINITY=verbose,granularity=fine,scatter
export I_MPI_ASYNC_PROGRESS=0
export I_MPI_ASYNC_PROGRESS_THREADS=0
# Executable selection; ex_name must match the chosen executable because it is
# used below as the base name of the trace files that are moved.
#test_prefix=COMM_
#executable="${HOME_DIR}dist/release_intel/Intel-Linux/bem4i_comm_test"
#test_prefix=OVERLAP_
#executable="${HOME_DIR}dist/release_intel/Intel-Linux/bem4i_novel"
#ex_name=bem4i_novel
test_prefix=BLOCKING_
executable="${HOME_DIR}dist/release_intel/Intel-Linux/bem4i_classic"
#executable="${HOME_DIR}dist/release_intel/Intel-Linux/bem4i_classic_ompDYN1"
ex_name=bem4i_classic
test_index=1
#REFINEMENT OPTIONS
refine4=4
refine9=1
#TEST PROBLEM OPTIONS
wave_direction=(1 0 1)
alpha=0 # alpha >= 0.0, when alpha=0.0 then only the transmission operator is used
problem_idx=0
#ACA OPTIONS
aca_eta=1.0
aca_eps=1e-6
cluster_base_dim=10
block_cluster_max_dim=10000
n_subdomains=2
max_rank=-1
#EVALUATION CUT OPTIONS
eval_boundary_refine=1e-3
eval_refine=6
#SOLVER OPTIONS
solver_eps=1e-6
max_num_iterations=5
gmres_buffer_size=500
#0 - classic, 1 - distributed
gmres_type=0
use_preconditioner=0
nsegments=4
# Positional command line of the executable; the order of the values matters.
arguments="${refine4} ${refine9} ${wave_direction[0]} ${wave_direction[1]} ${wave_direction[2]} ${aca_eta} ${aca_eps} ${solver_eps} ${cluster_base_dim} ${block_cluster_max_dim} ${n_subdomains} ${eval_boundary_refine} ${eval_refine} ${alpha} ${problem_idx} ${max_num_iterations} -1 ${max_rank} ${gmres_buffer_size} ${gmres_type} ${use_preconditioner} ${nsegments}"
# Total number of MPI ranks (4 nodes x 4 ranks per node in the PBS request above).
mpi=16
echo mpirun -np ${mpi} ${executable} ${arguments}
#mpirun -np ${mpi} ./${executable} ${arguments}
echo mpirun -np ${mpi} trace.sh ${executable} ${arguments}
mpirun -np ${mpi} ./trace.sh ./${executable} ${arguments}
# Archive the trace files under a name encoding refinement levels and rank count.
mv ${ex_name}.prv /home/mal539/POP/bem4i_kernel_reduction/bem4i_MK/traces/omp_schedule/${ex_name}_ref${refine4}_${refine9}_mpi${mpi}.prv
mv ${ex_name}.row /home/mal539/POP/bem4i_kernel_reduction/bem4i_MK/traces/omp_schedule/${ex_name}_ref${refine4}_${refine9}_mpi${mpi}.row
mv ${ex_name}.pcf /home/mal539/POP/bem4i_kernel_reduction/bem4i_MK/traces/omp_schedule/${ex_name}_ref${refine4}_${refine9}_mpi${mpi}.pcf
This diff is collapsed.
/*!
* @file ACAMatrix.h
* @author Dalibor Lukas
* @author Michal Merta
* @author Jan Zapletal
* @date June 16, 2014
* @brief Header file for class ACAMatrix
*
*/
#ifndef ACAMATRIX_H
#define ACAMATRIX_H
#include <iostream>
#include <fstream>
#include <set>
#include <algorithm>
#include <iterator>
#include "FullMatrix.h"
#include "LowRankMatrix.h"
#include "FastBESpace.h"
namespace bem4i {
template<class LO, class SC>
class ACAMatrix : public Matrix<LO, SC> {
typedef typename GetType<LO, SC>::SCVT SCVT;
public:
//! default constructor
ACAMatrix( );
/*!
* Constructor allocating a full matrix
*
* @param[in] nRows number of rows
* @param[in] nCols number of columns
*/
ACAMatrix( LO nRows, LO nCols );
//! destructor
virtual ~ACAMatrix( );
//! adds full nonadmissible block to the list of nonadmissible blocks
/*
inline void addNonadmissibleBlock(
FullMatrix<LO, SC>* block
) {
nonAdmissibleBlocks.push_back( block );
#pragma omp atomic update
nonadmBlocksSize += block->getNCols( ) * block->getNRows( );
}
*/
/*!
 * Registers the full block belonging to the idx-th nonadmissible leaf
 *
 * The pointer is written into slot idx of the list of nonadmissible blocks
 * (bounds-checked access). For a non-null block the running total of stored
 * nonadmissible entries grows by (number of columns) x (number of rows).
 *
 * @param[in] block full matrix block, may be nullptr
 * @param[in] idx index of the nonadmissible leaf the block belongs to
 */
inline void addNonadmissibleBlock(
    FullMatrix<LO, SC>* block,
    LO idx
    ) {
  nonAdmissibleBlocks.at( idx ) = block;

  // nothing to account for when the slot is left empty
  if ( block == nullptr ) {
    return;
  }

  // the size counter is shared, hence the atomic update
#pragma omp atomic update
  nonadmBlocksSize += block->getNCols( ) * block->getNRows( );
}
//! adds list of nonadmissible cluster pairs
inline void setNonadmissibleDOFs(
std::vector< BEBlockCluster< LO, SC > * > & leaves
) {
// this->nonadmissibleInnerDOFs.clear();
// this->nonadmissibleOuterDOFs.clear();
this->nonadmissibleInnerDOFs.reserve( leaves.size( ) );
this->nonadmissibleOuterDOFs.reserve( leaves.size( ) );
for ( LO i = 0; i < leaves.size( ); ++i ) {
// printf("full block %5d\n", i);
// printf("inner DOFs: ");
// for(auto i: *leaves[ i ]->innerDOFs){ printf("%d, ", i); }
// printf("\nouter DOFs: ");
// for(auto i: *leaves[ i ]->outerDOFs){ printf("%d, ", i); }
// printf("\n");
this->nonadmissibleInnerDOFs.push_back( leaves[ i ]->innerDOFs );
this->nonadmissibleOuterDOFs.push_back( leaves[ i ]->outerDOFs );
// this->nonadmissibleInnerDOFs.push_back(
// new std::vector< LO >( *( leaves[ i ]->innerDOFs ) ) );
// this->nonadmissibleOuterDOFs.push_back(
// new std::vector< LO >( *( leaves[ i ]->outerDOFs ) ) );
if ( leaves.at( i )->innerDOFs->size( ) > this->maxBlockSize ) {
this->maxBlockSize = leaves.at( i )->innerDOFs->size( );
}
if ( leaves.at( i )->outerDOFs->size( ) > this->maxBlockSize ) {
this->maxBlockSize = leaves.at( i )->outerDOFs->size( );
}
}
}
//! adds list of admissible cluster pairs
inline void setAdmissibleDOFs(
std::vector< BEBlockCluster< LO, SC > * > & leaves
) {
// this->admissibleInnerDOFs.clear();
// this->admissibleOuterDOFs.clear();
this->admissibleInnerDOFs.reserve( leaves.size( ) );
this->admissibleOuterDOFs.reserve( leaves.size( ) );
for ( LO i = 0; i < leaves.size( ); ++i ) {
// printf( " ACA block %5d\n", i );
// printf( "inner DOFs: " );
// for ( auto i : *leaves[ i ]->innerDOFs ) {
// printf( "%d, ", i );
// }
// printf( "\nouter DOFs: " );
// for ( auto i : *leaves[ i ]->outerDOFs ) {
// printf( "%d, ", i );
// }
// printf( "\n" );
this->admissibleInnerDOFs.push_back( leaves[ i ]->innerDOFs );
this->admissibleOuterDOFs.push_back( leaves[ i ]->outerDOFs );
// this->admissibleInnerDOFs.push_back(
// new std::vector< LO >( *( leaves[ i ]->innerDOFs ) ) );
// this->admissibleOuterDOFs.push_back(
// new std::vector< LO >( *( leaves[ i ]->outerDOFs ) ) );
if ( leaves.at( i )->innerDOFs->size( ) > this->maxBlockSize ) {
this->maxBlockSize = leaves.at( i )->innerDOFs->size( );
}
if ( leaves.at( i )->outerDOFs->size( ) > this->maxBlockSize ) {
this->maxBlockSize = leaves.at( i )->outerDOFs->size( );
}
}
}
/*!
 * Registers the low-rank block belonging to the idx-th admissible leaf
 *
 * The pointer is written into slot idx of the list of admissible blocks
 * (unchecked access, the list must already be large enough). For a non-null
 * block the running total of stored admissible entries grows by the number
 * of entries reported by the block.
 *
 * @param[in] B low-rank block, may be nullptr
 * @param[in] idx index of the admissible leaf the block belongs to
 */
inline void addAdmissibleBlock(
    LowRankMatrix<LO, SC>* B,
    LO idx
    ) {
  admissibleBlocks[ idx ] = B;

  // nothing to account for when the slot is left empty
  if ( B == nullptr ) {
    return;
  }

  // the size counter is shared, hence the atomic update
#pragma omp atomic update
  admBlocksSize += B->get_n_entries( );
}
inline double getAverageRankOfAdmissibleBlocks() const;
//! applies matrix to a vector
virtual void apply(
Vector<LO, SC> const &x,
Vector<LO, SC> &y,
bool transA = false,
SC alpha = 1.0,
SC beta = 0.0
);
/*!
 * Returns the ratio of stored entries to the entries of the full matrix
 *
 * The numerator is the sum of the nonadmissible and admissible size
 * counters, the denominator is (number of columns) x (number of rows).
 *
 * @return compression ratio, or -1.0 when columns x rows is not positive
 */
inline SCVT getCompressionRatio( ) {
  const LO cols = this->getNCols( );
  const LO rows = this->getNRows( );

  // guard against division by a non-positive matrix size
  const bool nonPositiveSize =
      static_cast< double >( cols ) * static_cast< double >( rows ) <= 0;
  if ( nonPositiveSize ) {
    return -1.0;
  }

  const SCVT fullSize = static_cast< SCVT >( cols ) * static_cast< SCVT >( rows );
  return static_cast< SCVT >( ( nonadmBlocksSize + admBlocksSize ) / fullSize );
}
//! resizes the list of nonadmissible blocks to hold the given number of slots
inline void resizeNonAdmBlocks( LO size ) {
  nonAdmissibleBlocks.resize( size );
}
//! resizes the list of admissible blocks to hold the given number of slots
inline void resizeAdmBlocks( LO size ) {
  admissibleBlocks.resize( size );
}
/*!
 * Prints a short summary of the matrix (label, number of rows and columns)
 *
 * The summary is written to the stream passed by the caller; previously the
 * argument was ignored and the output always went to std::cout.
 *
 * @param[in,out] stream output stream to write to (std::cout by default)
 */
void print(
    std::ostream &stream = std::cout
    ) const {
  stream << "ACA Matrix\n";
  stream << "Number of rows: " << this->nRows << std::endl;
  stream << "Number of cols: " << this->nCols << std::endl;
}
void print_values(std::ostream &stream = std::cout, bool active_reporter = true);
//! sets the p12p1dis flag of this matrix
// NOTE(review): the name suggests a switch for a P1 -> discontinuous P1
// transformation; confirm against the code that reads getP12p1dis()
void setP12p1dis(
bool p12p1dis
) {
// this-> is required here: the parameter shadows the member of the same name
this->p12p1dis = p12p1dis;
}
//! sets the p1dis2p1 flag of this matrix
// NOTE(review): the name suggests a switch for a discontinuous P1 -> P1
// transformation; confirm against the code that reads getP1dis2p1()
void setP1dis2p1(
bool p1dis2p1
) {
// this-> is required here: the parameter shadows the member of the same name
this->p1dis2p1 = p1dis2p1;
}
//! returns the current value of the p12p1dis flag
bool getP12p1dis(
) {
return this->p12p1dis;
}
//! returns the current value of the p1dis2p1 flag
bool getP1dis2p1(
) {
return this->p1dis2p1;
}
/*!
 * Counts the nonadmissible blocks that actually hold data
 *
 * @return number of non-null nonadmissible blocks whose
 *         (number of rows) x (number of columns) is positive
 */
LO getNNonadmissibleBlocks( ) {
  LO count = 0;
  for ( auto block : this->nonAdmissibleBlocks ) {
    // empty slots do not count
    if ( !block ) {
      continue;
    }
    if ( block->getNRows( ) * block->getNCols( ) > 0 ) {
      ++count;
    }
  }
  return count;
}
// LO getNNonadmissibleNullSizeBlocks(){
//
// LO rows = 0, cols = 0, output = 0;
//
// for( LO i = 0; i < this->nonAdmissibleBlocks.size(); i++ ){
//
// if( this->nonAdmissibleBlocks.at( i ) == nullptr ){
// output++;
// continue;
// }
// rows = this->nonAdmissibleBlocks.at( i )->getNRows( );
// cols = this->nonAdmissibleBlocks.at( i )->getNCols( );
//
// if( rows * cols <= 0 ){
// output++;
// }
//
// }
// return output;
// }
/*!
 * Counts the admissible blocks that actually hold data
 *
 * @return number of non-null admissible blocks reporting a positive
 *         number of entries
 */
LO getNAdmissibleBlocks( ) {
  LO count = 0;
  for ( auto block : this->admissibleBlocks ) {
    // empty slots do not count
    if ( !block ) {
      continue;
    }
    if ( block->get_n_entries( ) > 0 ) {
      ++count;
    }
  }
  return count;
}
// LO getNAdmissibleNullSizeBlocks(){
// LO rows = 0, cols = 0, output = 0;
//
// FullMatrix< LO, SC > *fP = nullptr, *sP = nullptr;
//
// for( LO i = 0; i < this->admissibleBlocks.size(); i++ ){
//
// fP = admissibleBlocks[i].first;
// sP = admissibleBlocks[i].second;
//
// if( fP != nullptr ){
// rows = fP->getNRows( );
// cols = fP->getNCols( );
//
// if( rows * cols <= 0 ){
// output++;
//
// continue;
// }
// }
// else{
// output++;
// continue;
// }
//
// if( sP != nullptr ){
// rows = sP->getNRows( );
// cols = sP->getNCols( );
//
// if( rows * cols <= 0 ){
// output++;
// }
// }
//
// }
// return output;
// }
//! returns the stored value of maxBlockSizeElems
// NOTE(review): where this member is computed is not visible in this part of
// the header - presumably in calculateBlockAnalysis(); confirm
LO getMaxBlockSizeElements(){
return this->maxBlockSizeElems;
}
void calculateBlockAnalysis();
void getBlockAnalysis(LO &nF, LO &nN, LO &aF, LO &aS, LO &aN);
void printBEBlockClusters(
LO &nonAdmissibleIdx_full,
LO &nonAdmissibleIdx_null,
LO &admissibleIdx_sparse,
LO &admissibleIdx_full,