44 #ifndef KOKKOS_MEMORYPOOL_HPP 45 #define KOKKOS_MEMORYPOOL_HPP 47 #include <Kokkos_Core_fwd.hpp> 50 #include <impl/Kokkos_BitOps.hpp> 51 #include <impl/Kokkos_Error.hpp> 52 #include <impl/Kokkos_SharedAlloc.hpp> 78 namespace MempoolImpl {
80 template <
typename T,
typename ExecutionSpace >
81 struct initialize_array {
82 typedef ExecutionSpace execution_space;
83 typedef typename ExecutionSpace::size_type size_type;
88 initialize_array( T * d,
size_t size, T v ) : m_data( d ), m_value( v )
92 execution_space::fence();
95 KOKKOS_INLINE_FUNCTION
96 void operator()( size_type i )
const { m_data[i] = m_value; }
99 template <
typename Bitset>
102 typedef typename Bitset::execution_space execution_space;
103 typedef typename execution_space::size_type size_type;
104 typedef typename Bitset::size_type value_type;
105 typedef typename Bitset::word_type word_type;
108 value_type & m_result;
110 bitset_count( word_type * w, value_type num_words, value_type & r )
111 : m_words( w ), m_result( r )
116 KOKKOS_INLINE_FUNCTION
117 void init( value_type & v )
const 120 KOKKOS_INLINE_FUNCTION
121 void join(
volatile value_type & dst,
volatile value_type
const & src )
const 124 KOKKOS_INLINE_FUNCTION
125 void operator()( size_type i, value_type & count )
const 127 count += Kokkos::Impl::bit_count( m_words[i] );
131 template <
typename Device >
134 typedef typename Device::execution_space execution_space;
135 typedef typename Device::memory_space memory_space;
136 typedef unsigned word_type;
137 typedef unsigned size_type;
139 typedef Kokkos::Impl::DeepCopy< memory_space, Kokkos::HostSpace > raw_deep_copy;
144 WORD_SIZE =
sizeof(word_type) * CHAR_BIT,
145 LG_WORD_SIZE = Kokkos::Impl::integral_power_of_two( WORD_SIZE ),
146 WORD_MASK = WORD_SIZE - 1
152 size_type m_num_words;
153 word_type m_last_word_mask;
158 Bitset( Bitset && ) =
default;
159 Bitset(
const Bitset & ) =
default;
160 Bitset & operator = ( Bitset && ) =
default;
161 Bitset & operator = (
const Bitset & ) =
default;
163 void init(
void * w, size_type s )
168 m_words =
reinterpret_cast<word_type*
>( w );
170 m_num_words = ( s + WORD_SIZE - 1 ) >> LG_WORD_SIZE;
171 m_last_word_mask = m_size & WORD_MASK ? ( word_type(1) << ( m_size & WORD_MASK ) ) - 1 : 0;
176 size_type size()
const {
return m_size; }
178 size_type count()
const 181 bitset_count< Bitset > bc( m_words, m_num_words, val );
188 initialize_array< word_type, execution_space > ia( m_words, m_num_words, ~word_type(0) );
190 if ( m_last_word_mask ) {
192 raw_deep_copy( m_words + ( m_num_words - 1 ), &m_last_word_mask,
sizeof(word_type) );
198 initialize_array< word_type, execution_space > ia( m_words, m_num_words, word_type(0) );
201 KOKKOS_FORCEINLINE_FUNCTION
202 bool test( size_type i )
const 204 size_type word_pos = i >> LG_WORD_SIZE;
205 word_type word = volatile_load( &m_words[ word_pos ] );
206 word_type mask = word_type(1) << ( i & WORD_MASK );
211 KOKKOS_FORCEINLINE_FUNCTION
212 bool set( size_type i )
const 214 size_type word_pos = i >> LG_WORD_SIZE;
215 word_type mask = word_type(1) << ( i & WORD_MASK );
217 return !( atomic_fetch_or( &m_words[ word_pos ], mask ) & mask );
220 KOKKOS_FORCEINLINE_FUNCTION
221 bool reset( size_type i )
const 223 size_type word_pos = i >> LG_WORD_SIZE;
224 word_type mask = word_type(1) << ( i & WORD_MASK );
226 return atomic_fetch_and( &m_words[ word_pos ], ~mask ) & mask;
229 KOKKOS_FORCEINLINE_FUNCTION
231 fetch_word_set( size_type i )
const 233 size_type word_pos = i >> LG_WORD_SIZE;
234 word_type mask = word_type(1) << ( i & WORD_MASK );
237 result.
second = atomic_fetch_or( &m_words[ word_pos ], mask );
243 KOKKOS_FORCEINLINE_FUNCTION
245 fetch_word_reset( size_type i )
const 247 size_type word_pos = i >> LG_WORD_SIZE;
248 word_type mask = word_type(1) << ( i & WORD_MASK );
251 result.
second = atomic_fetch_and( &m_words[ word_pos ], ~mask );
257 KOKKOS_FORCEINLINE_FUNCTION
259 set_any_in_word( size_type & pos )
const 261 size_type word_pos = pos >> LG_WORD_SIZE;
262 word_type word = volatile_load( &m_words[ word_pos ] );
267 size_type bit = Kokkos::Impl::bit_scan_forward( ~word );
270 word_type mask = word_type(1) << bit;
271 word = atomic_fetch_or( &m_words[ word_pos ], mask );
273 if ( !( word & mask ) ) {
275 pos = ( word_pos << LG_WORD_SIZE ) + bit;
285 KOKKOS_FORCEINLINE_FUNCTION
287 set_any_in_word( size_type & pos, word_type word_mask )
const 289 size_type word_pos = pos >> LG_WORD_SIZE;
290 word_type word = volatile_load( &m_words[ word_pos ] );
291 word = ( ~word ) & word_mask;
296 size_type bit = Kokkos::Impl::bit_scan_forward( word );
299 word_type mask = word_type(1) << bit;
300 word = atomic_fetch_or( &m_words[ word_pos ], mask );
302 if ( !( word & mask ) ) {
304 pos = ( word_pos << LG_WORD_SIZE ) + bit;
309 word = ( ~word ) & word_mask;
316 KOKKOS_FORCEINLINE_FUNCTION
318 reset_any_in_word( size_type & pos )
const 320 size_type word_pos = pos >> LG_WORD_SIZE;
321 word_type word = volatile_load( &m_words[ word_pos ] );
326 size_type bit = Kokkos::Impl::bit_scan_forward( word );
329 word_type mask = word_type(1) << bit;
330 word = atomic_fetch_and( &m_words[ word_pos ], ~mask );
334 pos = ( word_pos << LG_WORD_SIZE ) + bit;
344 KOKKOS_FORCEINLINE_FUNCTION
346 reset_any_in_word( size_type & pos, word_type word_mask )
const 348 size_type word_pos = pos >> LG_WORD_SIZE;
349 word_type word = volatile_load( &m_words[ word_pos ] );
350 word = word & word_mask;
355 size_type bit = Kokkos::Impl::bit_scan_forward( word );
358 word_type mask = word_type(1) << bit;
359 word = atomic_fetch_and( &m_words[ word_pos ], ~mask );
363 pos = ( word_pos << LG_WORD_SIZE ) + bit;
368 word = word & word_mask;
376 template <
typename UInt32View,
typename BSHeaderView,
typename SBHeaderView,
377 typename MempoolBitset >
378 struct create_histogram {
379 typedef typename UInt32View::execution_space execution_space;
380 typedef typename execution_space::size_type size_type;
384 UInt32View m_page_histogram;
385 BSHeaderView m_blocksize_info;
386 SBHeaderView m_sb_header;
387 MempoolBitset m_sb_blocks;
388 size_t m_lg_max_sb_blocks;
389 uint32_t m_lg_min_block_size;
390 uint32_t m_blocks_per_page;
391 value_type & m_result;
393 create_histogram(
size_t start,
size_t end, UInt32View ph, BSHeaderView bsi,
394 SBHeaderView sbh, MempoolBitset sbb,
size_t lmsb,
395 uint32_t lmbs, uint32_t bpp, value_type & r )
396 : m_start( start ), m_page_histogram( ph ), m_blocksize_info( bsi ),
397 m_sb_header( sbh ), m_sb_blocks( sbb ), m_lg_max_sb_blocks( lmsb ),
398 m_lg_min_block_size( lmbs ), m_blocks_per_page( bpp ), m_result( r )
402 execution_space::fence();
405 KOKKOS_INLINE_FUNCTION
406 void init( value_type & v )
const 412 KOKKOS_INLINE_FUNCTION
413 void join(
volatile value_type & dst,
volatile value_type
const & src )
const 415 dst.first += src.first;
416 dst.second += src.second;
419 KOKKOS_INLINE_FUNCTION
420 void operator()( size_type i, value_type & r )
const 422 size_type i2 = i + m_start;
424 uint32_t lg_block_size = m_sb_header(i2).m_lg_block_size;
427 if ( lg_block_size != 0 ) {
428 uint32_t block_size_id = lg_block_size - m_lg_min_block_size;
429 uint32_t blocks_per_sb = m_blocksize_info[block_size_id].m_blocks_per_sb;
430 uint32_t pages_per_sb = m_blocksize_info[block_size_id].m_pages_per_sb;
432 uint32_t total_allocated_blocks = 0;
434 for ( uint32_t j = 0; j < pages_per_sb; ++j ) {
435 unsigned start_pos = ( i2 << m_lg_max_sb_blocks ) + j * m_blocks_per_page;
436 unsigned end_pos = start_pos + m_blocks_per_page;
437 uint32_t page_allocated_blocks = 0;
439 for (
unsigned k = start_pos; k < end_pos; ++k ) {
440 page_allocated_blocks += m_sb_blocks.test( k );
443 total_allocated_blocks += page_allocated_blocks;
445 atomic_increment( &m_page_histogram(page_allocated_blocks) );
448 r.first += double(total_allocated_blocks) / blocks_per_sb;
449 r.second += blocks_per_sb;
454 #ifdef KOKKOS_MEMPOOL_PRINT_SUPERBLOCK_INFO 455 template <
typename UInt32View,
typename SBHeaderView,
typename MempoolBitset >
456 struct count_allocated_blocks {
457 typedef typename UInt32View::execution_space execution_space;
458 typedef typename execution_space::size_type size_type;
460 UInt32View m_num_allocated_blocks;
461 SBHeaderView m_sb_header;
462 MempoolBitset m_sb_blocks;
464 size_t m_lg_max_sb_blocks;
466 count_allocated_blocks(
size_t num_sb, UInt32View nab, SBHeaderView sbh,
467 MempoolBitset sbb,
size_t sbs,
size_t lmsb )
468 : m_num_allocated_blocks( nab ), m_sb_header( sbh ),
469 m_sb_blocks( sbb ), m_sb_size( sbs ), m_lg_max_sb_blocks( lmsb )
473 execution_space::fence();
476 KOKKOS_INLINE_FUNCTION
477 void operator()( size_type i )
const 479 uint32_t lg_block_size = m_sb_header(i).m_lg_block_size;
482 if ( lg_block_size != 0 ) {
484 uint32_t blocks_per_sb = lg_block_size > 0 ? m_sb_size >> lg_block_size : 0;
485 unsigned start_pos = i << m_lg_max_sb_blocks;
486 unsigned end_pos = start_pos + blocks_per_sb;
489 for (
unsigned j = start_pos; j < end_pos; ++j ) {
490 count += m_sb_blocks.test( j );
493 m_num_allocated_blocks(i) = count;
// ---------------------------------------------------------------------------
// MemoryPool<Device> — leading typedefs and enum constants.
// NOTE(review): extraction-garbled fragment; the class head and several lines
// are missing here, so only comments are added.  Verify against upstream.
// ---------------------------------------------------------------------------
514 template <
typename Device >
// Execution and backing memory space types derived from the device.
564 typedef typename Device::execution_space execution_space;
565 typedef typename Device::memory_space backend_memory_space;
// Concurrent bitset type used for all pool bookkeeping.
567 typedef MempoolImpl::Bitset< device_type > MempoolBitset;
// lg2 of the smallest block size the pool hands out.
572 LG_MIN_BLOCK_SIZE = Kokkos::Impl::integral_power_of_two( MIN_BLOCK_SIZE ),
// Superblock sizes are capped at 2^31, bounding the number of block sizes.
573 MAX_BLOCK_SIZES = 31 - LG_MIN_BLOCK_SIZE + 1,
// A "page" of blocks is one bitset word.
576 BLOCKS_PER_PAGE = MempoolBitset::WORD_SIZE,
577 LG_BLOCKS_PER_PAGE = MempoolBitset::LG_WORD_SIZE,
// Sentinels stored in m_active: no active superblock / entry is locked.
579 INVALID_SUPERBLOCK = ~uint32_t(0),
580 SUPERBLOCK_LOCK = ~uint32_t(0) - 1,
588 struct SuperblockHeader {
589 uint32_t m_full_pages;
590 uint32_t m_empty_pages;
591 uint32_t m_lg_block_size;
592 uint32_t m_is_active;
596 m_full_pages(0), m_empty_pages(0), m_lg_block_size(0), m_is_active(
false) {}
600 struct BlockSizeHeader {
601 uint32_t m_blocks_per_sb;
602 uint32_t m_pages_per_sb;
603 uint32_t m_sb_full_level;
604 uint32_t m_page_full_level;
608 m_blocks_per_sb(0), m_pages_per_sb(0), m_sb_full_level(0), m_page_full_level(0) {}
// ---------------------------------------------------------------------------
// MemoryPool<Device> — data members.
// NOTE(review): extraction-garbled fragment; several member declarations were
// dropped by the extraction, so only comments are added here.
// ---------------------------------------------------------------------------
// Tracks the shared allocation that backs the pool.
612 typedef Kokkos::Impl::SharedAllocationTracker Tracker;
// lg2 of the maximum number of blocks per superblock.
620 size_t m_lg_max_sb_blocks;
// Number of superblocks rounded up to a whole bitset word.
623 size_t m_ceil_num_sb;
// Number of distinct block sizes supported.
628 size_t m_num_block_size;
// Byte sizes of the three bitsets laid out after the pool data.
630 size_t m_sb_blocks_size;
631 size_t m_empty_sb_size;
632 size_t m_partfull_sb_size;
// Per-block usage bits for every superblock.
638 MempoolBitset m_sb_blocks;
// One bit per superblock: set while the superblock is empty.
640 MempoolBitset m_empty_sb;
// One bit per (block size, superblock): set while partially full.
641 MempoolBitset m_partfull_sb;
// Precomputed constants for each supported block size.
643 BlockSizeHeader m_blocksize_info[MAX_BLOCK_SIZES];
673 size_t total_size,
size_t log2_superblock_size = 20 )
674 : m_lg_sb_size( log2_superblock_size ),
675 m_sb_size( size_t(1) << m_lg_sb_size ),
676 m_lg_max_sb_blocks( m_lg_sb_size - LG_MIN_BLOCK_SIZE ),
677 m_num_sb( ( total_size + m_sb_size - 1 ) >> m_lg_sb_size ),
678 m_ceil_num_sb( ( ( m_num_sb + BLOCKS_PER_PAGE - 1 ) >> LG_BLOCKS_PER_PAGE ) <<
679 LG_BLOCKS_PER_PAGE ),
680 m_num_block_size( m_lg_sb_size - LG_MIN_BLOCK_SIZE + 1 ),
681 m_data_size( m_num_sb * m_sb_size ),
682 m_sb_blocks_size( ( m_num_sb << m_lg_max_sb_blocks ) / CHAR_BIT ),
683 m_empty_sb_size( m_ceil_num_sb / CHAR_BIT ),
684 m_partfull_sb_size( m_ceil_num_sb * m_num_block_size / CHAR_BIT ),
685 m_total_size( m_data_size + m_sb_blocks_size + m_empty_sb_size + m_partfull_sb_size ),
687 m_active(
"Active superblocks" ),
688 m_sb_header(
"Superblock headers" ),
692 static_assert( Kokkos::Impl::is_integral_power_of_two( MIN_BLOCK_SIZE ),
"" );
696 if ( m_sb_size < MIN_BLOCK_SIZE * BLOCKS_PER_PAGE ) {
697 printf(
"\n** MemoryPool::MemoryPool() Superblock size must be >= %u **\n",
698 MIN_BLOCK_SIZE * BLOCKS_PER_PAGE );
699 #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST 706 if ( m_lg_sb_size > 31 ) {
707 printf(
"\n** MemoryPool::MemoryPool() Superblock size must be < %u **\n",
708 ( uint32_t(1) << 31 ) );
709 #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST 718 if ( m_data_size >
size_t(MIN_BLOCK_SIZE) * std::numeric_limits<unsigned>::max() ) {
719 printf(
"\n** MemoryPool::MemoryPool() Allocator can only manage %lu bytes of memory; requested %lu **\n",
720 size_t(MIN_BLOCK_SIZE) * std::numeric_limits<unsigned>::max(), total_size );
721 #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST 729 resize( m_active, m_num_block_size );
730 resize( m_sb_header, m_num_sb );
733 typedef Kokkos::Impl::SharedAllocationRecord< backend_memory_space, void > SharedRecord;
735 SharedRecord::allocate( memspace,
"mempool", m_total_size );
737 m_track.assign_allocated_record_to_uninitialized( rec );
738 m_data =
reinterpret_cast<char *
>( rec->data() );
741 m_sb_blocks.init( m_data + m_data_size, m_num_sb << m_lg_max_sb_blocks );
744 m_empty_sb.init( m_data + m_data_size + m_sb_blocks_size, m_num_sb );
750 m_partfull_sb.init( m_data + m_data_size + m_sb_blocks_size + m_empty_sb_size,
751 m_ceil_num_sb * m_num_block_size );
755 for (
size_t i = 0; i < m_num_block_size; ++i ) host_active(i) = INVALID_SUPERBLOCK;
759 const double superblock_full_fraction = .8;
762 const double page_full_fraction = .875;
765 for (
size_t i = 0; i < m_num_block_size; ++i ) {
766 uint32_t lg_block_size = i + LG_MIN_BLOCK_SIZE;
767 uint32_t blocks_per_sb = m_sb_size >> lg_block_size;
768 uint32_t pages_per_sb = ( blocks_per_sb + BLOCKS_PER_PAGE - 1 ) >> LG_BLOCKS_PER_PAGE;
770 m_blocksize_info[i].m_blocks_per_sb = blocks_per_sb;
771 m_blocksize_info[i].m_pages_per_sb = pages_per_sb;
774 m_blocksize_info[i].m_sb_full_level =
775 static_cast<uint32_t
>( pages_per_sb * superblock_full_fraction );
777 if ( m_blocksize_info[i].m_sb_full_level == 0 ) {
778 m_blocksize_info[i].m_sb_full_level = 1;
782 uint32_t blocks_per_page =
783 blocks_per_sb < BLOCKS_PER_PAGE ? blocks_per_sb : BLOCKS_PER_PAGE;
785 m_blocksize_info[i].m_page_full_level =
786 static_cast<uint32_t
>( blocks_per_page * page_full_fraction );
788 if ( m_blocksize_info[i].m_page_full_level == 0 ) {
789 m_blocksize_info[i].m_page_full_level = 1;
793 #ifdef KOKKOS_MEMPOOL_PRINT_CONSTRUCTOR_INFO 795 printf(
" m_lg_sb_size: %12lu\n", m_lg_sb_size );
796 printf(
" m_sb_size: %12lu\n", m_sb_size );
797 printf(
" m_max_sb_blocks: %12lu\n",
size_t(1) << m_lg_max_sb_blocks );
798 printf(
"m_lg_max_sb_blocks: %12lu\n", m_lg_max_sb_blocks );
799 printf(
" m_num_sb: %12lu\n", m_num_sb );
800 printf(
" m_ceil_num_sb: %12lu\n", m_ceil_num_sb );
801 printf(
" m_num_block_size: %12lu\n", m_num_block_size );
802 printf(
" data bytes: %12lu\n", m_data_size );
803 printf(
" sb_blocks bytes: %12lu\n", m_sb_blocks_size );
804 printf(
" empty_sb bytes: %12lu\n", m_empty_sb_size );
805 printf(
" partfull_sb bytes: %12lu\n", m_partfull_sb_size );
806 printf(
" total bytes: %12lu\n", m_total_size );
807 printf(
" m_empty_sb size: %12u\n", m_empty_sb.size() );
808 printf(
"m_partfull_sb size: %12u\n", m_partfull_sb.size() );
813 #ifdef KOKKOS_MEMPOOL_PRINT_BLOCKSIZE_INFO 815 printf(
"SIZE BLOCKS_PER_SB PAGES_PER_SB SB_FULL_LEVEL PAGE_FULL_LEVEL\n" );
816 for (
size_t i = 0; i < m_num_block_size; ++i ) {
817 printf(
"%4zu %13u %12u %13u %15u\n", i + LG_MIN_BLOCK_SIZE,
818 m_blocksize_info[i].m_blocks_per_sb, m_blocksize_info[i].m_pages_per_sb,
819 m_blocksize_info[i].m_sb_full_level, m_blocksize_info[i].m_page_full_level );
826 KOKKOS_INLINE_FUNCTION
828 {
return size_t(1) << ( get_block_size_index( alloc_size ) + LG_MIN_BLOCK_SIZE ); }
842 if ( alloc_size <= m_sb_size )
844 int block_size_id = get_block_size_index( alloc_size );
845 uint32_t blocks_per_sb = m_blocksize_info[block_size_id].m_blocks_per_sb;
846 uint32_t pages_per_sb = m_blocksize_info[block_size_id].m_pages_per_sb;
848 #ifdef KOKKOS_CUDA_CLANG_WORKAROUND 850 if ( pages_per_sb == 0 )
return NULL;
853 unsigned word_size = blocks_per_sb > 32 ? 32 : blocks_per_sb;
854 unsigned word_mask = ( uint64_t(1) << word_size ) - 1;
861 uint32_t sb_id = volatile_load( &m_active(block_size_id) );
865 while ( sb_id == SUPERBLOCK_LOCK ) {
866 sb_id = atomic_fetch_or( &m_active(block_size_id), uint32_t(0) );
871 bool allocation_done =
false;
873 while ( !allocation_done ) {
874 bool need_new_sb =
false;
876 if ( sb_id != INVALID_SUPERBLOCK ) {
878 uint64_t hash_val = get_clock_register();
881 uint32_t pos_base = sb_id << m_lg_max_sb_blocks;
885 uint32_t pos_rel = uint32_t( hash_val & ( pages_per_sb - 1 ) ) << LG_BLOCKS_PER_PAGE;
888 uint32_t pos = pos_base + pos_rel;
893 uint32_t pages_searched = 0;
895 bool search_done =
false;
897 while ( !search_done ) {
898 bool success =
false;
899 unsigned prev_val = 0;
901 Kokkos::tie( success, prev_val ) = m_sb_blocks.set_any_in_word( pos, word_mask );
904 if ( ++pages_searched >= pages_per_sb ) {
920 pos += BLOCKS_PER_PAGE;
921 pos = ( pos < pos_base + blocks_per_sb ) ? pos : pos_base;
929 allocation_done =
true;
931 uint32_t lg_block_size = block_size_id + LG_MIN_BLOCK_SIZE;
933 p = m_data + ( size_t(sb_id) << m_lg_sb_size ) +
934 ( ( pos - pos_base ) << lg_block_size );
936 uint32_t used_bits = Kokkos::Impl::bit_count( prev_val );
938 if ( used_bits == 0 ) {
941 atomic_decrement( &m_sb_header(sb_id).m_empty_pages );
943 else if ( used_bits == m_blocksize_info[block_size_id].m_page_full_level - 1 )
947 uint32_t full_pages = atomic_fetch_add( &m_sb_header(sb_id).m_full_pages, 1 );
950 if ( full_pages == m_blocksize_info[block_size_id].m_sb_full_level - 1 ) {
965 uint32_t new_sb_id = find_superblock( block_size_id, sb_id );
967 if ( new_sb_id == sb_id ) {
968 allocation_done =
true;
969 #ifdef KOKKOS_MEMPOOL_PRINT_INFO 970 printf(
"** No superblocks available. **\n" );
971 #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST 982 #ifdef KOKKOS_MEMPOOL_PRINT_INFO 984 printf(
"** Requested allocation size (%zu) larger than superblock size (%lu). **\n",
985 alloc_size, m_sb_size );
986 #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST 1002 char * ap =
static_cast<char *
>( alloc_ptr );
1005 if ( ap >= m_data && ap + alloc_size <= m_data + m_data_size ) {
1009 uint32_t sb_id = ( ap - m_data ) >> m_lg_sb_size;
1012 uint32_t pos_base = sb_id << m_lg_max_sb_blocks;
1015 uint32_t offset = ( ap - m_data ) - (
size_t(sb_id) << m_lg_sb_size );
1016 uint32_t lg_block_size = m_sb_header(sb_id).m_lg_block_size;
1017 uint32_t block_size_id = lg_block_size - LG_MIN_BLOCK_SIZE;
1018 uint32_t pos_rel = offset >> lg_block_size;
1020 bool success =
false;
1021 unsigned prev_val = 0;
1025 Kokkos::tie( success, prev_val ) = m_sb_blocks.fetch_word_reset( pos_base + pos_rel );
1029 uint32_t page_fill_level = Kokkos::Impl::bit_count( prev_val );
1031 if ( page_fill_level == 1 ) {
1034 uint32_t empty_pages = atomic_fetch_add( &m_sb_header(sb_id).m_empty_pages, 1 );
1036 if ( !volatile_load( &m_sb_header(sb_id).m_is_active ) &&
1037 empty_pages == m_blocksize_info[block_size_id].m_pages_per_sb - 1 )
1041 unsigned pos = block_size_id * m_ceil_num_sb + sb_id;
1043 if ( m_partfull_sb.reset( pos ) ) {
1045 volatile_store( &m_sb_header(sb_id).m_empty_pages, uint32_t(0) );
1046 volatile_store( &m_sb_header(sb_id).m_lg_block_size, uint32_t(0) );
1050 m_empty_sb.set( sb_id );
1054 else if ( page_fill_level == m_blocksize_info[block_size_id].m_page_full_level ) {
1057 uint32_t full_pages = atomic_fetch_sub( &m_sb_header(sb_id).m_full_pages, 1 );
1059 if ( !volatile_load( &m_sb_header(sb_id).m_is_active ) &&
1060 full_pages == m_blocksize_info[block_size_id].m_sb_full_level )
1065 unsigned pos = block_size_id * m_ceil_num_sb + sb_id;
1066 m_partfull_sb.set( pos );
1071 #ifdef KOKKOS_MEMPOOL_PRINTERR 1073 printf(
"\n** MemoryPool::deallocate() ADDRESS_OUT_OF_RANGE(0x%llx) **\n",
1074 reinterpret_cast<uint64_t>( alloc_ptr ) );
1075 #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST 1083 KOKKOS_INLINE_FUNCTION
1091 for (
size_t i = 0; i < m_num_sb; ++i ) {
1092 uint32_t lg_block_size = m_sb_header(i).m_lg_block_size;
1095 if ( lg_block_size == 0 )
return false;
1097 uint32_t block_size_id = lg_block_size - LG_MIN_BLOCK_SIZE;
1098 uint32_t full_pages = volatile_load( &m_sb_header(i).m_full_pages );
1100 if ( full_pages < m_blocksize_info[block_size_id].m_sb_full_level )
return false;
// ---------------------------------------------------------------------------
// print_status(): host-side diagnostic dump of the pool's superblock, page,
// and block usage.  Builds a per-page used-block histogram via
// MempoolImpl::create_histogram and prints summary statistics; extra sections
// are enabled by the KOKKOS_MEMPOOL_PRINT_* macros.
// NOTE(review): extraction-garbled — many interior lines (e.g. mirror-view
// declarations and the section between the per-superblock dump and the final
// summary) are missing, so only comments are added here.  Verify against
// upstream Kokkos_MemoryPool.hpp.
// ---------------------------------------------------------------------------
1108 void print_status()
const 1112 #ifdef KOKKOS_MEMPOOL_PRINT_SUPERBLOCK_INFO 1114 deep_copy( host_sb_header, m_sb_header );
// Count the allocated blocks of every superblock on the device.
1116 UInt32View num_allocated_blocks(
"Allocated Blocks", m_num_sb );
1120 MempoolImpl::count_allocated_blocks< UInt32View, SBHeaderView, MempoolBitset >
1121 mch( m_num_sb, num_allocated_blocks, m_sb_header,
1122 m_sb_blocks, m_sb_size, m_lg_max_sb_blocks );
1126 create_mirror_view( num_allocated_blocks );
1127 deep_copy( host_num_allocated_blocks, num_allocated_blocks );
// Per-superblock dump: size, active flag, empty/full pages, used blocks.
1130 printf(
"SB_ID SIZE ACTIVE EMPTY_PAGES FULL_PAGES USED_BLOCKS\n" );
1131 for (
size_t i = 0; i < m_num_sb; ++i ) {
1132 printf(
"%5zu %4u %6d %11u %10u %10u\n", i,
1133 host_sb_header(i).m_lg_block_size, host_sb_header(i).m_is_active,
1134 host_sb_header(i).m_empty_pages, host_sb_header(i).m_full_pages,
1135 host_num_allocated_blocks(i) );
// Histogram of pages by number of used blocks (0..32).
1141 UInt32View page_histogram(
"Page Histogram", 33 );
// Copy the host-resident blocksize constants to the device for the functor.
1144 typedef View< BlockSizeHeader *, device_type > BSHeaderView;
1145 BSHeaderView blocksize_info(
"BlockSize Headers", MAX_BLOCK_SIZES );
1147 Kokkos::Impl::DeepCopy< backend_memory_space, Kokkos::HostSpace >
1148 dc( blocksize_info.ptr_on_device(), m_blocksize_info,
1149 sizeof(BlockSizeHeader) * m_num_block_size );
// Build the histogram over all superblocks; result accumulates
// (fill-fraction sum, total block capacity).
1155 MempoolImpl::create_histogram< UInt32View, BSHeaderView, SBHeaderView, MempoolBitset >
1156 mch( 0, m_num_sb, page_histogram, blocksize_info, m_sb_header, m_sb_blocks,
1157 m_lg_max_sb_blocks, LG_MIN_BLOCK_SIZE, BLOCKS_PER_PAGE, result );
1161 deep_copy( host_page_histogram, page_histogram );
// Aggregate used pages/blocks from the histogram (bucket 0 = empty pages).
1164 uint32_t used_pages = 0;
1165 uint32_t used_blocks = 0;
1166 for ( uint32_t i = 1; i < 33; ++i ) {
1167 used_pages += host_page_histogram(i);
1168 used_blocks += i * host_page_histogram(i);
1170 uint32_t total_pages = used_pages + host_page_histogram(0);
1172 unsigned num_empty_sb = m_empty_sb.count();
1173 unsigned num_non_empty_sb = m_num_sb - num_empty_sb;
1174 unsigned num_partfull_sb = m_partfull_sb.count();
1176 uint32_t total_blocks = result.
second;
// Average fill fraction over non-empty superblocks; guarded against /0.
1177 double ave_sb_full = num_non_empty_sb == 0 ? 0.0 : result.
first / num_non_empty_sb;
1178 double percent_used_sb = double( m_num_sb - num_empty_sb ) / m_num_sb;
1179 double percent_used_pages = total_pages == 0 ? 0.0 : double(used_pages) / total_pages;
1180 double percent_used_blocks = total_blocks == 0 ? 0.0 : double(used_blocks) / total_blocks;
// Count the superblocks currently active for some block size.
1186 unsigned num_active_sb = 0;
1187 for (
size_t i = 0; i < m_num_block_size; ++i ) {
1188 num_active_sb += host_active(i) != INVALID_SUPERBLOCK;
// Optional: dump the active superblock (I=invalid, L=locked) per block size.
1191 #ifdef KOKKOS_MEMPOOL_PRINT_ACTIVE_SUPERBLOCKS 1193 printf(
"BS_ID SB_ID\n" );
1194 for (
size_t i = 0; i < m_num_block_size; ++i ) {
1195 uint32_t sb_id = host_active(i);
1197 if ( sb_id == INVALID_SUPERBLOCK ) {
1198 printf(
"%5zu I\n", i );
1200 else if ( sb_id == SUPERBLOCK_LOCK ) {
1201 printf(
"%5zu L\n", i );
1204 printf(
"%5zu %7u\n", i, sb_id );
// Optional: dump the page histogram.
1211 #ifdef KOKKOS_MEMPOOL_PRINT_PAGE_INFO 1213 printf(
"USED_BLOCKS PAGE_COUNT\n" );
1214 for ( uint32_t i = 0; i < 33; ++i ) {
1215 printf(
"%10u %10u\n", i, host_page_histogram[i] );
// Optional: rebuild and dump the histogram for selected single superblocks.
1220 #ifdef KOKKOS_MEMPOOL_PRINT_INDIVIDUAL_PAGE_INFO 1224 const uint32_t num_sb_id = 1;
1225 uint32_t sb_id[num_sb_id] = { 0 };
1227 for ( uint32_t i = 0; i < num_sb_id; ++i ) {
1231 MempoolImpl::create_histogram< UInt32View, BSHeaderView, SBHeaderView, MempoolBitset >
1232 mch( sb_id[i], sb_id[i] + 1, page_histogram, blocksize_info, m_sb_header,
1233 m_sb_blocks, m_lg_max_sb_blocks, LG_MIN_BLOCK_SIZE, BLOCKS_PER_PAGE, result );
1236 deep_copy( host_page_histogram, page_histogram );
1238 printf(
"SB_ID USED_BLOCKS PAGE_COUNT\n" );
1239 for ( uint32_t j = 0; j < 33; ++j ) {
1240 printf(
"%5u %10u %10u\n", sb_id[i], j, host_page_histogram[j] );
// Final summary of pool utilization.
1274 printf(
" Used blocks: %10u / %10u = %10.6lf\n", used_blocks, total_blocks,
1275 percent_used_blocks );
1276 printf(
" Used pages: %10u / %10u = %10.6lf\n", used_pages, total_pages,
1277 percent_used_pages );
1278 printf(
" Used SB: %10zu / %10zu = %10.6lf\n", m_num_sb - num_empty_sb, m_num_sb,
1280 printf(
" Active SB: %10u\n", num_active_sb );
1281 printf(
" Empty SB: %10u\n", num_empty_sb );
1282 printf(
" Partfull SB: %10u\n", num_partfull_sb );
1283 printf(
" Full SB: %10lu\n",
1284 m_num_sb - num_active_sb - num_empty_sb - num_partfull_sb );
1285 printf(
"Ave. SB Full %%: %10.6lf\n", ave_sb_full );
1289 #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST 1294 KOKKOS_INLINE_FUNCTION
1295 size_t get_min_block_size()
const {
return MIN_BLOCK_SIZE; }
1297 size_t get_mem_size()
const {
return m_data_size; }
1304 KOKKOS_FORCEINLINE_FUNCTION
1305 int get_block_size_index(
const size_t size )
const 1311 uint32_t first_nonzero_bit =
1312 Kokkos::Impl::bit_scan_reverse( static_cast<unsigned>( size ) );
1318 uint32_t lg2_size = first_nonzero_bit + !Kokkos::Impl::is_integral_power_of_two( size );
1319 lg2_size = lg2_size > LG_MIN_BLOCK_SIZE ? lg2_size : LG_MIN_BLOCK_SIZE;
1323 return lg2_size - LG_MIN_BLOCK_SIZE;
1336 uint32_t find_superblock(
int block_size_id, uint32_t old_sb )
const 1340 Kokkos::atomic_compare_exchange( &m_active(block_size_id), old_sb, SUPERBLOCK_LOCK );
1346 uint32_t new_sb = lock_sb;
1348 if ( lock_sb == old_sb ) {
1354 size_t max_tries = m_ceil_num_sb >> LG_BLOCKS_PER_PAGE;
1356 bool search_done =
false;
1360 unsigned pos = block_size_id * m_ceil_num_sb;
1362 while ( !search_done ) {
1363 bool success =
false;
1364 unsigned prev_val = 0;
1366 Kokkos::tie( success, prev_val ) = m_partfull_sb.reset_any_in_word( pos );
1369 if ( ++tries >= max_tries ) {
1374 pos += BLOCKS_PER_PAGE;
1392 new_sb = pos - block_size_id * m_ceil_num_sb;
1395 volatile_store( &m_sb_header(new_sb).m_is_active, uint32_t(
true) );
1399 if ( lock_sb != INVALID_SUPERBLOCK ) {
1400 volatile_store( &m_sb_header(lock_sb).m_is_active, uint32_t(
false) );
1408 if ( new_sb == lock_sb ) {
1410 search_done =
false;
1416 while ( !search_done ) {
1417 bool success =
false;
1418 unsigned prev_val = 0;
1420 Kokkos::tie( success, prev_val ) = m_empty_sb.reset_any_in_word( pos );
1423 if ( ++tries >= max_tries ) {
1428 pos += BLOCKS_PER_PAGE;
1451 volatile_store( &m_sb_header(new_sb).m_empty_pages,
1452 m_blocksize_info[block_size_id].m_pages_per_sb );
1453 volatile_store( &m_sb_header(new_sb).m_lg_block_size,
1454 block_size_id + LG_MIN_BLOCK_SIZE );
1455 volatile_store( &m_sb_header(new_sb).m_is_active, uint32_t(
true) );
1459 if ( lock_sb != INVALID_SUPERBLOCK ) {
1460 volatile_store( &m_sb_header(lock_sb).m_is_active, uint32_t(
false) );
1469 atomic_exchange( &m_active(block_size_id), new_sb );
1478 new_sb = atomic_fetch_or( &m_active(block_size_id), uint32_t(0) );
1479 }
while ( new_sb == SUPERBLOCK_LOCK );
1487 #ifdef KOKKOS_MEMPOOL_PRINTERR 1488 if ( new_sb == INVALID_SUPERBLOCK ) {
1489 printf(
"\n** MemoryPool::find_superblock() FOUND_INACTIVE_SUPERBLOCK **\n" );
1490 #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST 1493 Kokkos::abort(
"" );
1502 KOKKOS_FORCEINLINE_FUNCTION
1503 uint64_t get_clock_register(
void)
const 1505 #if defined( __CUDA_ARCH__ ) 1508 #elif defined( __i386__ ) || defined( __x86_64 ) 1510 unsigned a = 0, d = 0;
1512 __asm__
volatile(
"rdtsc" :
"=a" (a),
"=d" (d) );
1514 return ( (uint64_t) a ) | ( ( (uint64_t) d ) << 32 );
1515 #elif defined( __powerpc ) || defined( __powerpc__ ) || defined( __powerpc64__ ) || \ 1516 defined( __POWERPC__ ) || defined( __ppc__ ) || defined( __ppc64__ ) 1517 unsigned int cycles = 0;
1519 asm volatile(
"mftb %0" :
"=r" (cycles) );
1521 return (uint64_t) cycles;
1523 const uint64_t ticks =
1524 std::chrono::high_resolution_clock::now().time_since_epoch().count();
1534 #ifdef KOKKOS_MEMPOOL_PRINTERR 1535 #undef KOKKOS_MEMPOOL_PRINTERR 1538 #ifdef KOKKOS_MEMPOOL_PRINT_INFO 1539 #undef KOKKOS_MEMPOOL_PRINT_INFO 1542 #ifdef KOKKOS_MEMPOOL_PRINT_BLOCKSIZE_INFO 1543 #undef KOKKOS_MEMPOOL_PRINT_BLOCKSIZE_INFO 1546 #ifdef KOKKOS_MEMPOOL_PRINT_SUPERBLOCK_INFO 1547 #undef KOKKOS_MEMPOOL_PRINT_SUPERBLOCK_INFO 1550 #ifdef KOKKOS_MEMPOOL_PRINT_PAGE_INFO 1551 #undef KOKKOS_MEMPOOL_PRINT_PAGE_INFO 1554 #ifdef KOKKOS_MEMPOOL_PRINT_INDIVIDUAL_PAGE_INFO 1555 #undef KOKKOS_MEMPOOL_PRINT_INDIVIDUAL_PAGE_INFO 1558 #endif // KOKKOS_MEMORYPOOL_HPP
std::enable_if< std::is_same< typename Kokkos::View< T, P... >::array_layout, Kokkos::LayoutLeft >::value||std::is_same< typename Kokkos::View< T, P... >::array_layout, Kokkos::LayoutRight >::value >::type resize(Kokkos::View< T, P... > &v, const size_t n0=0, const size_t n1=0, const size_t n2=0, const size_t n3=0, const size_t n4=0, const size_t n5=0, const size_t n6=0, const size_t n7=0)
Resize a view with copying old data to new data at the corresponding indices.
Bitset based memory manager for pools of same-sized chunks of memory.
void parallel_reduce(const std::string &label, const PolicyType &policy, const FunctorType &functor, ReturnType &return_value, typename Impl::enable_if< Kokkos::Impl::is_execution_policy< PolicyType >::value >::type *=0)
Parallel reduction.
Replacement for std::pair that works on CUDA devices.
KOKKOS_FUNCTION void deallocate(void *alloc_ptr, size_t alloc_size) const
Release allocated memory back to the pool.
first_type first
The first element of the pair.
Memory space for main process and CPU execution spaces.
MemoryPool memory_space
Tag this class as a Kokkos memory space.
Declaration of parallel operators.
KOKKOS_INLINE_FUNCTION size_t allocate_block_size(const size_t alloc_size) const
The actual block size allocated given alloc_size.
void parallel_for(const ExecPolicy &policy, const FunctorType &functor, const std::string &str="", typename Impl::enable_if< ! Impl::is_integral< ExecPolicy >::value >::type *=0)
Execute functor in parallel according to the execution policy.
KOKKOS_FUNCTION void * allocate(size_t alloc_size) const
Allocate a chunk of memory.
void deep_copy(const View< DT, DP... > &dst, typename ViewTraits< DT, DP... >::const_value_type &value, typename std::enable_if< std::is_same< typename ViewTraits< DT, DP... >::specialize, void >::value >::type *=0)
Deep copy a value from Host memory into a view.
KOKKOS_INLINE_FUNCTION bool is_empty() const
Tests if the memory pool has no more memory available to allocate.
KOKKOS_FORCEINLINE_FUNCTION pair< T1 &, T2 & > tie(T1 &x, T2 &y)
Return a pair of references to the input arguments.
View< typename traits::non_const_data_type, typename traits::array_layout, typename traits::host_mirror_space > HostMirror
Compatible HostMirror view.
second_type second
The second element of the pair.
MemoryPool(const backend_memory_space &memspace, size_t total_size, size_t log2_superblock_size=20)
Initializes the memory pool.