// Entry point of the assembly performance test: registers and parses the
// command-line options, optionally attaches VTune to this process, then runs
// performance_test_driver<Device> inside each device section guarded by the
// KOKKOS_HAVE_PTHREAD / KOKKOS_HAVE_OPENMP / KOKKOS_HAVE_CUDA macros.
// NOTE(review): this listing carries embedded listing numbers and appears to
// be missing lines (the declarations of CLP, nGrid, nIter, print, check, the
// device flags, and all braces are not visible here) -- confirm every note
// below against the complete file.
48 #include "Kokkos_Core.hpp" 53 #ifdef KOKKOS_HAVE_CUDA 54 #include "cuda_runtime_api.h" 58 #include <sys/types.h> 61 int main(
int argc,
char *argv[])
// Query hwloc for the available NUMA count, cores per NUMA domain and
// threads per core; these values seed the defaults of the "cores" and
// "hyperthreads" settings further down.
67 const size_t num_sockets = Kokkos::hwloc::get_available_numa_count();
68 const size_t num_cores_per_socket =
69 Kokkos::hwloc::get_available_cores_per_numa();
70 const size_t num_threads_per_core =
71 Kokkos::hwloc::get_available_threads_per_core();
// NOTE(review): the help text below reads "This test performance of ..." --
// presumably meant "This tests the performance of ..."; confirm before editing.
76 "This test performance of MP::Vector FEM assembly.\n");
// Mesh-size options: either a single size ("n") or a begin/end/step range.
78 CLP.
setOption(
"n", &nGrid,
"Number of mesh points in each direction. Set to zero to use a range");
80 CLP.
setOption(
"n-begin", &nGridBegin,
"Beginning number of mesh points in each direction.");
82 CLP.
setOption(
"n-end", &nGridEnd,
"Ending number of mesh points in each direction.");
84 CLP.
setOption(
"n-step", &nGridStep,
"Increment in number of mesh points in each direction.");
// Number of assembly iterations.
86 CLP.
setOption(
"ni", &nIter,
"Number of assembly iterations");
// Boolean option: "print" is the true-name and "no-print" the false-name.
88 CLP.
setOption(
"print",
"no-print", &print,
"Print debugging output");
// Basis-function choice; defaults to linear (quadratic == false).
91 bool quadratic =
false;
92 CLP.
setOption(
"quadratic",
"linear", &quadratic,
"Use quadratic basis functions");
// Default core count: cores per socket times the number of sockets.
93 int num_cores = num_cores_per_socket * num_sockets;
95 "Number of CPU cores to use (defaults to all)");
// Default hyperthread count: all hardware threads per core.
96 int num_hyper_threads = num_threads_per_core;
97 CLP.
setOption(
"hyperthreads", &num_hyper_threads,
98 "Number of hyper threads per core to use (defaults to all)");
// Per-device enable flags; each one sits behind the matching
// KOKKOS_HAVE_* preprocessor guard.
99 #ifdef KOKKOS_HAVE_PTHREAD 101 CLP.
setOption(
"threads",
"no-threads", &threads,
"Enable Threads device");
103 #ifdef KOKKOS_HAVE_OPENMP 105 CLP.
setOption(
"openmp",
"no-openmp", &openmp,
"Enable OpenMP device");
107 #ifdef KOKKOS_HAVE_CUDA 109 CLP.
setOption(
"cuda",
"no-cuda", &cuda,
"Enable Cuda device");
// CUDA device ordinal; later handed to Kokkos::Cuda::SelectDevice and
// cudaGetDeviceProperties.
111 CLP.
setOption(
"device", &device_id,
"CUDA device ID.");
// Flag controlling whether a VTune collection is attached to this process.
114 CLP.
setOption(
"vtune",
"no-vtune", &vtune,
"connect to vtune");
// Parse the command line into the variables registered above.
115 CLP.
parse( argc, argv );
// Build a shell command that runs a hotspots collection targeting this
// process's PID, echo it, and launch it through system(). The trailing " &"
// asks the shell to run it in the background.
// NOTE(review): the start of the command (presumably vtune_loc) is not
// visible here, and the return value of system() is discarded.
124 std::stringstream cmd;
125 pid_t my_os_pid=getpid();
126 const std::string vtune_loc =
128 const std::string output_dir =
"./vtune/vtune.0";
130 <<
" -collect hotspots -result-dir " << output_dir
131 <<
" -target-pid " << my_os_pid <<
" &";
132 std::cout << cmd.str() << std::endl;
133 system(cmd.str().c_str());
// Threads device: initialize with num_cores*num_hyper_threads threads, print
// a banner, run the driver, then finalize.
137 #ifdef KOKKOS_HAVE_PTHREAD 139 typedef Kokkos::Threads Device;
141 Kokkos::Threads::initialize(num_cores*num_hyper_threads);
143 std::cout << std::endl
144 <<
"Threads performance with " << num_cores*num_hyper_threads
145 <<
" threads:" << std::endl;
147 performance_test_driver<Device>(
148 print, nIter, nGridBegin, nGridEnd, nGridStep, quadratic,
check);
150 Kokkos::Threads::finalize();
// OpenMP device: same initialize / banner / driver / finalize sequence.
154 #ifdef KOKKOS_HAVE_OPENMP 156 typedef Kokkos::OpenMP Device;
158 Kokkos::OpenMP::initialize(num_cores*num_hyper_threads);
160 std::cout << std::endl
161 <<
"OpenMP performance with " << num_cores*num_hyper_threads
162 <<
" threads:" << std::endl;
164 performance_test_driver<Device>(
165 print, nIter, nGridBegin, nGridEnd, nGridStep, quadratic,
check);
167 Kokkos::OpenMP::finalize();
// Cuda device: the host execution space is initialized first, then Cuda on
// the selected device. The device name printed in the banner comes from
// cudaGetDeviceProperties. Finalization runs host space first, then Cuda.
// NOTE(review): the banner text repeats the word "performance"; the return
// code of cudaGetDeviceProperties is not checked.
171 #ifdef KOKKOS_HAVE_CUDA 173 typedef Kokkos::Cuda Device;
175 Kokkos::HostSpace::execution_space::initialize();
176 Kokkos::Cuda::initialize(Kokkos::Cuda::SelectDevice(device_id));
178 cudaDeviceProp deviceProp;
179 cudaGetDeviceProperties(&deviceProp, device_id);
180 std::cout << std::endl
181 <<
"CUDA performance performance with device " << device_id
183 << deviceProp.name <<
"):" 186 performance_test_driver<Device>(
187 print, nIter, nGridBegin, nGridEnd, nGridStep, quadratic,
check);
189 Kokkos::HostSpace::execution_space::finalize();
190 Kokkos::Cuda::finalize();
int main(int argc, char *argv[])
int check(Epetra_CrsGraph &A, int NumMyRows1, int NumGlobalRows1, int NumMyNonzeros1, int NumGlobalNonzeros1, int *MyGlobalElements, bool verbose)
void setOption(const char option_true[], const char option_false[], bool *option_val, const char documentation[]=NULL)
EParseCommandLineReturn parse(int argc, char *argv[], std::ostream *errout=&std::cerr) const
#define TEUCHOS_STANDARD_CATCH_STATEMENTS(VERBOSE, ERR_STREAM, SUCCESS_FLAG)
void setDocString(const char doc_string[])