42 #include "Teuchos_UnitTestHarness.hpp" 43 #include "Teuchos_UnitTestRepository.hpp" 44 #include "Teuchos_GlobalMPISession.hpp" 57 Teuchos::FancyOStream& out) {
58 typedef Kokkos::Cuda Device;
61 typedef typename Storage::template apply_N<VectorSize>::type
storage_type;
65 KokkosSparse::DeviceConfig dev_config(num_blocks, num_vec_threads, num_row_threads);
67 bool success = test_embedded_vector<Vector>(
68 nGrid,
VectorSize, dev_config, MultiplyOp(), out);
75 Kokkos_CrsMatrix_MP, Multiply_Default,
Storage, MultiplyOp )
79 const Ordinal ThreadsPerVector = 16;
82 const Ordinal num_vec_threads = 0;
83 const Ordinal num_row_threads = 0;
86 test_cuda_embedded_vector<Storage,Ordinal,MultiplyOp,NumPerThread,ThreadsPerVector>(num_blocks, num_vec_threads, num_row_threads, out);
90 Kokkos_CrsMatrix_MP, Multiply_1,
Storage, MultiplyOp )
94 const Ordinal ThreadsPerVector = 16;
97 const Ordinal num_vec_threads = ThreadsPerVector;
98 const Ordinal num_row_threads = 4;
101 test_cuda_embedded_vector<Storage,Ordinal,MultiplyOp,NumPerThread,ThreadsPerVector>(num_blocks, num_vec_threads, num_row_threads, out);
105 Kokkos_CrsMatrix_MP, Multiply_2,
Storage, MultiplyOp )
108 const Ordinal NumPerThread = 2;
109 const Ordinal ThreadsPerVector = 16;
112 const Ordinal num_vec_threads = ThreadsPerVector;
113 const Ordinal num_row_threads = 4;
116 test_cuda_embedded_vector<Storage,Ordinal,MultiplyOp,NumPerThread,ThreadsPerVector>(num_blocks, num_vec_threads, num_row_threads, out);
120 Kokkos_CrsMatrix_MP, Multiply_3,
Storage, MultiplyOp )
123 const Ordinal NumPerThread = 3;
124 const Ordinal ThreadsPerVector = 16;
127 const Ordinal num_vec_threads = ThreadsPerVector;
128 const Ordinal num_row_threads = 4;
131 test_cuda_embedded_vector<Storage,Ordinal,MultiplyOp,NumPerThread,ThreadsPerVector>(num_blocks, num_vec_threads, num_row_threads, out);
135 Kokkos_CrsMatrix_MP, Multiply_4,
Storage, MultiplyOp )
138 const Ordinal NumPerThread = 4;
139 const Ordinal ThreadsPerVector = 16;
142 const Ordinal num_vec_threads = ThreadsPerVector;
143 const Ordinal num_row_threads = 4;
146 test_cuda_embedded_vector<Storage,Ordinal,MultiplyOp,NumPerThread,ThreadsPerVector>(num_blocks, num_vec_threads, num_row_threads, out);
149 #define CRS_MATRIX_MP_VECTOR_MULTIPLY_TESTS_STORAGE_OP( STORAGE, OP ) \ 150 TEUCHOS_UNIT_TEST_TEMPLATE_2_INSTANT( \ 151 Kokkos_CrsMatrix_MP, Multiply_Default, STORAGE, OP ) \ 152 TEUCHOS_UNIT_TEST_TEMPLATE_2_INSTANT( \ 153 Kokkos_CrsMatrix_MP, Multiply_1, STORAGE, OP ) \ 154 TEUCHOS_UNIT_TEST_TEMPLATE_2_INSTANT( \ 155 Kokkos_CrsMatrix_MP, Multiply_2, STORAGE, OP ) \ 156 TEUCHOS_UNIT_TEST_TEMPLATE_2_INSTANT( \ 157 Kokkos_CrsMatrix_MP, Multiply_3, STORAGE, OP ) \ 158 TEUCHOS_UNIT_TEST_TEMPLATE_2_INSTANT( \ 159 Kokkos_CrsMatrix_MP, Multiply_4, STORAGE, OP ) 164 #define CRS_MATRIX_MP_VECTOR_MULTIPLY_TESTS_ORDINAL_SCALAR_DEVICE( ORDINAL, SCALAR, DEVICE ) \ 165 CRS_MATRIX_MP_VECTOR_MULTIPLY_TESTS_STORAGE_OP( SFS, DefaultMultiply ) \ 166 CRS_MATRIX_MP_VECTOR_MULTIPLY_TESTS_STORAGE_OP( SFS, KokkosMultiply ) \ 167 CRS_MATRIX_MP_VECTOR_MULTIPLY_TESTS_STORAGE_OP( DS, DefaultMultiply ) \ 168 CRS_MATRIX_MP_VECTOR_MULTIPLY_TESTS_STORAGE_OP( DS, KokkosMultiply ) 173 Teuchos::GlobalMPISession mpiSession(&argc, &
argv);
176 Kokkos::HostSpace::execution_space::initialize();
177 Kokkos::Cuda::initialize(Kokkos::Cuda::SelectDevice(0));
178 Kokkos::Cuda::print_configuration(std::cout);
181 int ret = Teuchos::UnitTestRepository::runUnitTestsFromMain(argc,
argv);
184 Kokkos::HostSpace::execution_space::finalize();
185 Kokkos::Cuda::finalize();
Stokhos::StandardStorage< int, double > storage_type
Stokhos::StandardStorage< int, double > Storage
#define CRS_MATRIX_MP_VECTOR_MULTIPLY_TESTS_ORDINAL_SCALAR_DEVICE(ORDINAL, SCALAR, DEVICE)
TEUCHOS_UNIT_TEST_TEMPLATE_2_DECL(Kokkos_CrsMatrix_MP, Multiply_Default, Storage, MultiplyOp)
#define CRSMATRIX_MP_VECTOR_TESTS_DEVICE(DEVICE)
bool test_cuda_embedded_vector(Ordinal num_blocks, Ordinal num_vec_threads, Ordinal num_row_threads, Teuchos::FancyOStream &out)
int main(int argc, char *argv[])
const unsigned VectorSize