44 #include "Kokkos_Sparse.hpp" 47 #include "impl/Kokkos_Timer.hpp" 49 template<
typename IntType >
56 return k + N * (
j + N * i );
61 std::vector< std::vector<size_t> > & graph )
63 graph.resize( N * N * N , std::vector<size_t>() );
67 for (
int i = 0 ; i < (
int) N ; ++i ) {
68 for (
int j = 0 ;
j < (
int) N ; ++
j ) {
69 for (
int k = 0 ; k < (
int) N ; ++k ) {
73 graph[row].reserve(27);
75 for (
int ii = -1 ; ii < 2 ; ++ii ) {
76 for (
int jj = -1 ; jj < 2 ; ++jj ) {
77 for (
int kk = -1 ; kk < 2 ; ++kk ) {
78 if ( 0 <= i + ii && i + ii < (
int) N &&
79 0 <=
j + jj &&
j + jj < (
int) N &&
80 0 <= k + kk && k + kk < (
int) N ) {
83 graph[row].push_back(col);
86 total += graph[row].size();
92 template <
typename ScalarType,
typename OrdinalType,
typename Device>
95 const OrdinalType nGrid,
96 const OrdinalType iterCount,
97 std::vector<double>& scalar_perf,
98 std::vector<double>& block_left_perf,
99 std::vector<double>& block_right_perf)
104 typedef Kokkos::View< value_type*, execution_space > vector_type;
105 typedef Kokkos::View< value_type**, Kokkos::LayoutLeft, execution_space > left_multivec_type;
107 typedef KokkosSparse::CrsMatrix< value_type, ordinal_type, execution_space > matrix_type;
108 typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
109 typedef typename matrix_type::values_type matrix_values_type;
114 std::vector< std::vector<size_t> > fem_graph;
115 const size_t fem_length = nGrid * nGrid * nGrid;
121 std::vector<vector_type>
x(ensemble_length);
122 std::vector<vector_type>
y(ensemble_length);
124 x[e] = vector_type(Kokkos::ViewAllocateWithoutInitializing(
"x"), fem_length);
125 y[e] = vector_type(Kokkos::ViewAllocateWithoutInitializing(
"y"), fem_length);
130 left_multivec_type xl(Kokkos::ViewAllocateWithoutInitializing(
"xl"), fem_length, ensemble_length);
131 left_multivec_type yl(Kokkos::ViewAllocateWithoutInitializing(
"yl"), fem_length, ensemble_length);
142 matrix_graph_type matrix_graph =
143 Kokkos::create_staticcrsgraph<matrix_graph_type>(
144 std::string(
"test crs graph"), fem_graph);
145 matrix_values_type matrix_values =
146 matrix_values_type(Kokkos::ViewAllocateWithoutInitializing(
"matrix"), graph_length);
147 matrix_type matrix(
"matrix", fem_length, matrix_values, matrix_graph);
161 execution_space::fence();
162 Kokkos::Impl::Timer clock ;
168 execution_space::fence();
170 const double seconds_per_iter = clock.seconds() / ((
double) iterCount );
171 const double flops = 1.0e-9 * 2.0 * graph_length * ensemble_length;
173 scalar_perf.resize(5);
174 scalar_perf[0] = fem_length;
175 scalar_perf[1] = ensemble_length;
176 scalar_perf[2] = graph_length;
177 scalar_perf[3] = seconds_per_iter;
178 scalar_perf[4] = flops / seconds_per_iter;
190 execution_space::fence();
191 Kokkos::Impl::Timer clock ;
195 execution_space::fence();
197 const double seconds_per_iter = clock.seconds() / ((
double) iterCount );
198 const double flops = 1.0e-9 * 2.0 * graph_length * ensemble_length;
200 block_left_perf.resize(5);
201 block_left_perf[0] = fem_length;
202 block_left_perf[1] = ensemble_length;
203 block_left_perf[2] = graph_length;
204 block_left_perf[3] = seconds_per_iter;
205 block_left_perf[4] = flops / seconds_per_iter;
218 execution_space::fence();
219 Kokkos::Impl::Timer clock ;
223 execution_space::fence();
225 const double seconds_per_iter = clock.seconds() / ((
double) iterCount );
226 const double flops = 1.0e-9 * 2.0 * graph_length * ensemble_length;
228 block_right_perf.resize(5);
229 block_right_perf[0] = fem_length;
230 block_right_perf[1] = ensemble_length;
231 block_right_perf[2] = graph_length;
232 block_right_perf[3] = seconds_per_iter;
233 block_right_perf[4] = flops / seconds_per_iter;
239 template <
typename Scalar,
typename Ordinal,
typename Device>
246 std::cout.precision(8);
247 std::cout << std::endl
248 <<
"\"Grid Size\" , " 250 <<
"\"FEM Graph Size\" , " 251 <<
"\"Ensemble Size\" , " 252 <<
"\"Scalar SpMM Time\" , " 253 <<
"\"Scalar SpMM Speedup\" , " 254 <<
"\"Scalar SpMM GFLOPS\" , " 255 <<
"\"Block-Left SpMM Speedup\" , " 256 <<
"\"Block-Left SpMM GFLOPS\" , " 261 std::vector<double> perf_scalar, perf_block_left, perf_block_right;
262 for (
Ordinal e=ensemble_min; e<=ensemble_max; e+=ensemble_step) {
264 test_spmm<Scalar,Ordinal,Device>(
265 e, nGrid, nIter, perf_scalar, perf_block_left, perf_block_right );
267 std::cout << nGrid <<
" , " 268 << perf_scalar[0] <<
" , " 269 << perf_scalar[2] <<
" , " 270 << perf_scalar[1] <<
" , " 271 << perf_scalar[3] <<
" , " 272 << perf_scalar[4] / perf_scalar[4] <<
" , " 273 << perf_scalar[4] <<
" , " 274 << perf_block_left[4]/ perf_scalar[4] <<
" , " 275 << perf_block_left[4] <<
" , " size_t generate_fem_graph(size_t N, std::vector< std::vector< size_t > > &graph)
IntType map_fem_graph_coord(const IntType &N, const IntType &i, const IntType &j, const IntType &k)
void test_spmm(const OrdinalType ensemble_length, const OrdinalType nGrid, const OrdinalType iterCount, std::vector< double > &scalar_perf, std::vector< double > &block_left_perf, std::vector< double > &block_right_perf)
Kokkos::DefaultExecutionSpace execution_space
const IndexType const IndexType const IndexType const IndexType const ValueType const ValueType * x
void performance_test_driver(const Ordinal nGrid, const Ordinal nIter, const Ordinal ensemble_min, const Ordinal ensemble_max, const Ordinal ensemble_step)
void deep_copy(const Stokhos::CrsMatrix< ValueType, DstDevice, Layout > &dst, const Stokhos::CrsMatrix< ValueType, SrcDevice, Layout > &src)
const IndexType const IndexType const IndexType const IndexType const ValueType const ValueType ValueType * y
std::enable_if< Kokkos::is_view_uq_pce< Kokkos::View< InputType, InputP... > >::value &&Kokkos::is_view_uq_pce< Kokkos::View< OutputType, OutputP... > >::value >::type spmv(const char mode[], const AlphaType &a, const MatrixType &A, const Kokkos::View< InputType, InputP... > &x, const BetaType &b, const Kokkos::View< OutputType, OutputP... > &y, const RANK_ONE)