HIP: Heterogeneous-computing Interface for Portability
amd_hip_cooperative_groups.h
/*
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H

#if __cplusplus
#if !defined(__HIPCC_RTC__)
#endif

namespace cooperative_groups {

class thread_group {
 protected:
  uint32_t _type;  // thread_group type
  uint32_t _size;  // total number of threads in the thread_group
  uint64_t _mask;  // Lane mask for coalesced and tiled partitioned group types;
                   // LSB represents lane 0, and MSB represents lane 63

  // Construct a thread group, and set the thread group type and other essential
  // thread group properties. This generic thread group is directly constructed
  // only when the group is supposed to contain only the calling thread
  // (through the API `this_thread()`); in all other cases, this thread
  // group object is a sub-object of some other derived thread group object.
  __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size = static_cast<uint64_t>(0),
                                uint64_t mask = static_cast<uint64_t>(0)) {
    _type = type;
    _size = size;
    _mask = mask;
  }

  struct _tiled_info {
    bool is_tiled;
    unsigned int size;
    unsigned int meta_group_rank;
    unsigned int meta_group_size;
  };

  struct _coalesced_info {
    lane_mask member_mask;
    unsigned int size;
    struct _tiled_info tiled_info;
  } coalesced_info;

  friend __CG_QUALIFIER__ thread_group this_thread();
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
                                                       unsigned int tile_size);
  friend class thread_block;

 public:
  // Total number of threads in the thread group; this serves all derived
  // cooperative group types since their `size` is saved directly during
  // construction
  __CG_QUALIFIER__ uint32_t size() const { return _size; }
  __CG_QUALIFIER__ unsigned int cg_type() const { return _type; }
  // Rank of the calling thread within [0, size())
  __CG_QUALIFIER__ uint32_t thread_rank() const;
  // Is this cooperative group type valid?
  __CG_QUALIFIER__ bool is_valid() const;
  // Synchronize the threads in the thread group
  __CG_QUALIFIER__ void sync() const;
};
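
// Illustrative usage sketch: because every specific group type derives from
// thread_group, generic device code can accept a thread_group reference and
// rely only on size(), thread_rank() and sync(). The helper name
// reduce_to_lane0 and the scratch buffer are hypothetical; scratch is assumed
// to provide one slot per group member.
//
//   __device__ void reduce_to_lane0(const cooperative_groups::thread_group& g,
//                                   float* scratch, float val) {
//     scratch[g.thread_rank()] = val;
//     g.sync();
//     // rank 0 of the group may now safely combine scratch[0 .. g.size()-1]
//   }
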
class multi_grid_group : public thread_group {
  // Only these friend functions are allowed to construct an object of this class
  // and access its resources
  friend __CG_QUALIFIER__ multi_grid_group this_multi_grid();

 protected:
  // Construct a multi-grid thread group (through the API this_multi_grid())
  explicit __CG_QUALIFIER__ multi_grid_group(uint32_t size)
      : thread_group(internal::cg_multi_grid, size) {}

 public:
  // Number of invocations participating in this multi-grid group. In other
  // words, the number of GPUs
  __CG_QUALIFIER__ uint32_t num_grids() { return internal::multi_grid::num_grids(); }
  // Rank of this invocation. In other words, an ID number within the range
  // [0, num_grids()) of the GPU this kernel is running on
  __CG_QUALIFIER__ uint32_t grid_rank() { return internal::multi_grid::grid_rank(); }
  __CG_QUALIFIER__ uint32_t thread_rank() const { return internal::multi_grid::thread_rank(); }
  __CG_QUALIFIER__ bool is_valid() const { return internal::multi_grid::is_valid(); }
  __CG_QUALIFIER__ void sync() const { internal::multi_grid::sync(); }
};

__CG_QUALIFIER__ multi_grid_group this_multi_grid() {
  return multi_grid_group(internal::multi_grid::size());
}
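
// Illustrative usage sketch: a multi_grid_group is only valid when the kernel
// is launched as a cooperative multi-device launch (for example via
// hipLaunchCooperativeKernelMultiDevice on the host). Kernel and parameter
// names below are hypothetical.
//
//   __global__ void multi_gpu_kernel(float* data) {
//     namespace cg = cooperative_groups;
//     cg::multi_grid_group mg = cg::this_multi_grid();
//     if (mg.is_valid()) {
//       // partition work by mg.grid_rank() over mg.num_grids() GPUs ...
//       mg.sync();  // synchronize every thread on every participating GPU
//     }
//   }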

class grid_group : public thread_group {
  // Only these friend functions are allowed to construct an object of this class
  // and access its resources
  friend __CG_QUALIFIER__ grid_group this_grid();

 protected:
  // Construct grid thread group (through the API this_grid())
  explicit __CG_QUALIFIER__ grid_group(uint32_t size) : thread_group(internal::cg_grid, size) {}

 public:
  __CG_QUALIFIER__ uint32_t thread_rank() const { return internal::grid::thread_rank(); }
  __CG_QUALIFIER__ bool is_valid() const { return internal::grid::is_valid(); }
  __CG_QUALIFIER__ void sync() const { internal::grid::sync(); }
  __CG_QUALIFIER__ dim3 group_dim() const { return internal::workgroup::block_dim(); }
};

__CG_QUALIFIER__ grid_group this_grid() { return grid_group(internal::grid::size()); }
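
// Illustrative usage sketch: grid_group provides a grid-wide barrier, which
// requires the kernel to be launched cooperatively (for example via
// hipLaunchCooperativeKernel). Names are hypothetical, and buf is assumed to
// hold one element per thread in the grid.
//
//   __global__ void two_phase_kernel(float* buf) {
//     namespace cg = cooperative_groups;
//     cg::grid_group grid = cg::this_grid();
//     buf[grid.thread_rank()] = 1.0f;   // phase 1: every thread writes its slot
//     grid.sync();                      // all threads in the entire grid arrive here
//     // phase 2: safe to read values written by threads of any other block
//   }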

class thread_block : public thread_group {
  // Only these friend functions are allowed to construct an object of this
  // class and access its resources
  friend __CG_QUALIFIER__ thread_block this_thread_block();
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
                                                       unsigned int tile_size);
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent,
                                                       unsigned int tile_size);

 protected:
  // Construct a workgroup thread group (through the API this_thread_block())
  explicit __CG_QUALIFIER__ thread_block(uint32_t size)
      : thread_group(internal::cg_workgroup, size) {}

  __CG_QUALIFIER__ thread_group new_tiled_group(unsigned int tile_size) const {
    const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
    // Invalid tile size, assert
    if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) {
      __hip_assert(false && "invalid tile size");
    }

    auto block_size = size();
    auto rank = thread_rank();
    auto partitions = (block_size + tile_size - 1) / tile_size;
    auto tail = (partitions * tile_size) - block_size;
    auto partition_size = tile_size - tail * (rank >= (partitions - 1) * tile_size);
    thread_group tiledGroup = thread_group(internal::cg_tiled_group, partition_size);

    tiledGroup.coalesced_info.tiled_info.size = tile_size;
    tiledGroup.coalesced_info.tiled_info.is_tiled = true;
    tiledGroup.coalesced_info.tiled_info.meta_group_rank = rank / tile_size;
    tiledGroup.coalesced_info.tiled_info.meta_group_size = partitions;
    return tiledGroup;
  }

 public:
  // 3-dimensional block index within the grid
  __CG_STATIC_QUALIFIER__ dim3 group_index() { return internal::workgroup::group_index(); }
  // 3-dimensional thread index within the block
  __CG_STATIC_QUALIFIER__ dim3 thread_index() { return internal::workgroup::thread_index(); }
  __CG_STATIC_QUALIFIER__ uint32_t thread_rank() { return internal::workgroup::thread_rank(); }
  __CG_STATIC_QUALIFIER__ uint32_t size() { return internal::workgroup::size(); }
  __CG_STATIC_QUALIFIER__ bool is_valid() { return internal::workgroup::is_valid(); }
  __CG_STATIC_QUALIFIER__ void sync() { internal::workgroup::sync(); }
  __CG_QUALIFIER__ dim3 group_dim() { return internal::workgroup::block_dim(); }
};

__CG_QUALIFIER__ thread_block this_thread_block() {
  return thread_block(internal::workgroup::size());
}
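
// Illustrative usage sketch: thread_block wraps the familiar workgroup-level
// operations (block/thread indices plus a block-wide barrier). Names are
// hypothetical, and a 256-thread block is assumed for the shared tile.
//
//   __global__ void block_kernel(float* out) {
//     namespace cg = cooperative_groups;
//     cg::thread_block block = cg::this_thread_block();
//     __shared__ float tile[256];
//     tile[block.thread_rank()] = static_cast<float>(block.thread_rank());
//     block.sync();  // barrier over the workgroup, like __syncthreads()
//     out[block.group_index().x * block.size() + block.thread_rank()] =
//         tile[block.size() - 1 - block.thread_rank()];
//   }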

class tiled_group : public thread_group {
 private:
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
                                                       unsigned int tile_size);
  friend __CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent,
                                                      unsigned int tile_size);

  __CG_QUALIFIER__ tiled_group new_tiled_group(unsigned int tile_size) const {
    const bool pow2 = ((tile_size & (tile_size - 1)) == 0);

    if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) {
      __hip_assert(false && "invalid tile size");
    }

    if (size() <= tile_size) {
      return *this;
    }

    tiled_group tiledGroup = tiled_group(tile_size);
    tiledGroup.coalesced_info.tiled_info.is_tiled = true;
    return tiledGroup;
  }

 protected:
  explicit __CG_QUALIFIER__ tiled_group(unsigned int tileSize)
      : thread_group(internal::cg_tiled_group, tileSize) {
    coalesced_info.tiled_info.size = tileSize;
    coalesced_info.tiled_info.is_tiled = true;
  }

 public:
  __CG_QUALIFIER__ unsigned int size() const { return (coalesced_info.tiled_info.size); }

  __CG_QUALIFIER__ unsigned int thread_rank() const {
    return (internal::workgroup::thread_rank() & (coalesced_info.tiled_info.size - 1));
  }

  __CG_QUALIFIER__ void sync() const {
    internal::tiled_group::sync();
  }
};

template <unsigned int size, class ParentCGTy> class thread_block_tile;

class coalesced_group : public thread_group {
 private:
  friend __CG_QUALIFIER__ coalesced_group coalesced_threads();
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsigned int tile_size);
  friend __CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tile_size);
  friend __CG_QUALIFIER__ coalesced_group binary_partition(const coalesced_group& cgrp, bool pred);
  template <unsigned int fsize, class fparent>
  friend __CG_QUALIFIER__ coalesced_group
  binary_partition(const thread_block_tile<fsize, fparent>& tgrp, bool pred);

  __CG_QUALIFIER__ coalesced_group new_tiled_group(unsigned int tile_size) const {
    const bool pow2 = ((tile_size & (tile_size - 1)) == 0);

    if (!tile_size || !pow2) {
      return coalesced_group(0);
    }

    // If a tiled group is passed to be partitioned further into a coalesced_group,
    // prepare a mask for further partitioning it so that it stays coalesced.
    if (coalesced_info.tiled_info.is_tiled) {
      unsigned int base_offset = (thread_rank() & (~(tile_size - 1)));
      unsigned int masklength = min(static_cast<unsigned int>(size()) - base_offset, tile_size);
      lane_mask member_mask = static_cast<lane_mask>(-1) >> (__AMDGCN_WAVEFRONT_SIZE - masklength);

      member_mask <<= (__lane_id() & ~(tile_size - 1));
      coalesced_group coalesced_tile = coalesced_group(member_mask);
      coalesced_tile.coalesced_info.tiled_info.is_tiled = true;
      coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
      coalesced_tile.coalesced_info.tiled_info.meta_group_size = size() / tile_size;
      return coalesced_tile;
    }
    // Here the parent coalesced_group is not partitioned.
    else {
      lane_mask member_mask = 0;
      unsigned int tile_rank = 0;
      int lanes_to_skip = ((thread_rank()) / tile_size) * tile_size;

      for (unsigned int i = 0; i < __AMDGCN_WAVEFRONT_SIZE; i++) {
        lane_mask active = coalesced_info.member_mask & (1 << i);
        // Make sure the lane is active
        if (active) {
          if (lanes_to_skip <= 0 && tile_rank < tile_size) {
            // Prepare a member_mask that is appropriate for a tile
            member_mask |= active;
            tile_rank++;
          }
          lanes_to_skip--;
        }
      }
      coalesced_group coalesced_tile = coalesced_group(member_mask);
      coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
      coalesced_tile.coalesced_info.tiled_info.meta_group_size =
          (size() + tile_size - 1) / tile_size;
      return coalesced_tile;
    }
    return coalesced_group(0);
  }

 protected:
  // Constructor
  explicit __CG_QUALIFIER__ coalesced_group(lane_mask member_mask)
      : thread_group(internal::cg_coalesced_group) {
    coalesced_info.member_mask = member_mask;  // Which threads are active
    coalesced_info.size = __popcll(coalesced_info.member_mask);  // How many threads are active
    coalesced_info.tiled_info.is_tiled = false;  // Not a partitioned group
    coalesced_info.tiled_info.meta_group_rank = 0;
    coalesced_info.tiled_info.meta_group_size = 1;
  }

 public:
  __CG_QUALIFIER__ unsigned int size() const {
    return coalesced_info.size;
  }

  __CG_QUALIFIER__ unsigned int thread_rank() const {
    return internal::coalesced_group::masked_bit_count(coalesced_info.member_mask);
  }

  __CG_QUALIFIER__ void sync() const {
    internal::coalesced_group::sync();
  }

  __CG_QUALIFIER__ unsigned int meta_group_rank() const {
    return coalesced_info.tiled_info.meta_group_rank;
  }

  __CG_QUALIFIER__ unsigned int meta_group_size() const {
    return coalesced_info.tiled_info.meta_group_size;
  }

  template <class T>
  __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");

    srcRank = srcRank % static_cast<int>(size());

    int lane = (size() == __AMDGCN_WAVEFRONT_SIZE) ? srcRank
               : (__AMDGCN_WAVEFRONT_SIZE == 64)
                   ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1))
                   : __fns32(coalesced_info.member_mask, 0, (srcRank + 1));

    return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
  }

  template <class T>
  __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");

    // Note: the CUDA implementation appears to use the remainder of lane_delta
    // and WARP_SIZE as the shift value rather than lane_delta itself.
    // This is not described in the documentation and is not done here.

    if (size() == __AMDGCN_WAVEFRONT_SIZE) {
      return __shfl_down(var, lane_delta, __AMDGCN_WAVEFRONT_SIZE);
    }

    int lane;
    if (__AMDGCN_WAVEFRONT_SIZE == 64) {
      lane = __fns64(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
    } else {
      lane = __fns32(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
    }

    if (lane == -1) {
      lane = __lane_id();
    }

    return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
  }

  template <class T>
  __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");

    // Note: the CUDA implementation appears to use the remainder of lane_delta
    // and WARP_SIZE as the shift value rather than lane_delta itself.
    // This is not described in the documentation and is not done here.

    if (size() == __AMDGCN_WAVEFRONT_SIZE) {
      return __shfl_up(var, lane_delta, __AMDGCN_WAVEFRONT_SIZE);
    }

    int lane;
    if (__AMDGCN_WAVEFRONT_SIZE == 64) {
      lane = __fns64(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
    } else if (__AMDGCN_WAVEFRONT_SIZE == 32) {
      lane = __fns32(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
    }

    if (lane == -1) {
      lane = __lane_id();
    }

    return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
  }
#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
  __CG_QUALIFIER__ unsigned long long ballot(int pred) const {
    return internal::helper::adjust_mask(
        coalesced_info.member_mask,
        __ballot_sync<unsigned long long>(coalesced_info.member_mask, pred));
  }

  __CG_QUALIFIER__ int any(int pred) const {
    return __any_sync(static_cast<unsigned long long>(coalesced_info.member_mask), pred);
  }

  __CG_QUALIFIER__ int all(int pred) const {
    return __all_sync(static_cast<unsigned long long>(coalesced_info.member_mask), pred);
  }

  template <typename T> __CG_QUALIFIER__ unsigned long long match_any(T value) const {
    return internal::helper::adjust_mask(
        coalesced_info.member_mask,
        __match_any_sync(static_cast<unsigned long long>(coalesced_info.member_mask), value));
  }

  template <typename T> __CG_QUALIFIER__ unsigned long long match_all(T value, int& pred) const {
    return internal::helper::adjust_mask(
        coalesced_info.member_mask,
        __match_all_sync(static_cast<unsigned long long>(coalesced_info.member_mask), value,
                         &pred));
  }
#endif
};

__CG_QUALIFIER__ coalesced_group coalesced_threads() {
  return cooperative_groups::coalesced_group(__builtin_amdgcn_read_exec());
}
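
// Illustrative usage sketch: coalesced_threads() captures whichever lanes of
// the wavefront are currently active, which is useful inside divergent code.
// The atomic-aggregation pattern below is a common use; the function name is
// hypothetical.
//
//   __device__ int aggregated_increment(int* counter) {
//     namespace cg = cooperative_groups;
//     cg::coalesced_group active = cg::coalesced_threads();
//     int warp_base = 0;
//     if (active.thread_rank() == 0) {
//       warp_base = atomicAdd(counter, active.size());  // one atomic per group
//     }
//     // broadcast the base offset from the group's lowest-ranked active lane
//     warp_base = active.shfl(warp_base, 0);
//     return warp_base + active.thread_rank();
//   }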

__CG_QUALIFIER__ uint32_t thread_group::thread_rank() const {
  switch (this->_type) {
    case internal::cg_multi_grid: {
      return (static_cast<const multi_grid_group*>(this)->thread_rank());
    }
    case internal::cg_grid: {
      return (static_cast<const grid_group*>(this)->thread_rank());
    }
    case internal::cg_workgroup: {
      return (static_cast<const thread_block*>(this)->thread_rank());
    }
    case internal::cg_tiled_group: {
      return (static_cast<const tiled_group*>(this)->thread_rank());
    }
    case internal::cg_coalesced_group: {
      return (static_cast<const coalesced_group*>(this)->thread_rank());
    }
    default: {
      __hip_assert(false && "invalid cooperative group type");
      return -1;
    }
  }
}

__CG_QUALIFIER__ bool thread_group::is_valid() const {
  switch (this->_type) {
    case internal::cg_multi_grid: {
      return (static_cast<const multi_grid_group*>(this)->is_valid());
    }
    case internal::cg_grid: {
      return (static_cast<const grid_group*>(this)->is_valid());
    }
    case internal::cg_workgroup: {
      return (static_cast<const thread_block*>(this)->is_valid());
    }
    case internal::cg_tiled_group: {
      return (static_cast<const tiled_group*>(this)->is_valid());
    }
    case internal::cg_coalesced_group: {
      return (static_cast<const coalesced_group*>(this)->is_valid());
    }
    default: {
      __hip_assert(false && "invalid cooperative group type");
      return false;
    }
  }
}

__CG_QUALIFIER__ void thread_group::sync() const {
  switch (this->_type) {
    case internal::cg_multi_grid: {
      static_cast<const multi_grid_group*>(this)->sync();
      break;
    }
    case internal::cg_grid: {
      static_cast<const grid_group*>(this)->sync();
      break;
    }
    case internal::cg_workgroup: {
      static_cast<const thread_block*>(this)->sync();
      break;
    }
    case internal::cg_tiled_group: {
      static_cast<const tiled_group*>(this)->sync();
      break;
    }
    case internal::cg_coalesced_group: {
      static_cast<const coalesced_group*>(this)->sync();
      break;
    }
    default: {
      __hip_assert(false && "invalid cooperative group type");
    }
  }
}

template <class CGTy> __CG_QUALIFIER__ uint32_t group_size(CGTy const& g) { return g.size(); }

template <class CGTy> __CG_QUALIFIER__ uint32_t thread_rank(CGTy const& g) {
  return g.thread_rank();
}

template <class CGTy> __CG_QUALIFIER__ bool is_valid(CGTy const& g) { return g.is_valid(); }

template <class CGTy> __CG_QUALIFIER__ void sync(CGTy const& g) { g.sync(); }
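
// Illustrative usage sketch: the free functions above let generic device code
// work with any cooperative group type. The function name is hypothetical;
// scratch must be visible to the whole group and hold one value per member.
//
//   template <typename Group>
//   __device__ float group_sum(const Group& g, float* scratch, float val) {
//     namespace cg = cooperative_groups;
//     scratch[cg::thread_rank(g)] = val;
//     cg::sync(g);
//     if (cg::thread_rank(g) == 0) {
//       float total = 0.0f;
//       for (uint32_t i = 0; i < cg::group_size(g); ++i) total += scratch[i];
//       scratch[0] = total;
//     }
//     cg::sync(g);
//     return scratch[0];
//   }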

template <unsigned int tileSize> class tile_base {
 protected:
  _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;

 public:
  // Rank of the thread within this tile
  _CG_STATIC_CONST_DECL_ unsigned int thread_rank() {
    return (internal::workgroup::thread_rank() & (numThreads - 1));
  }

  // Number of threads within this tile
  __CG_STATIC_QUALIFIER__ unsigned int size() { return numThreads; }
};

template <unsigned int size> class thread_block_tile_base : public tile_base<size> {
  static_assert(is_valid_tile_size<size>::value,
                "Tile size is either not a power of 2 or greater than the wavefront size");
  using tile_base<size>::numThreads;

  template <unsigned int fsize, class fparent>
  friend __CG_QUALIFIER__ coalesced_group
  binary_partition(const thread_block_tile<fsize, fparent>& tgrp, bool pred);

#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
  __CG_QUALIFIER__ unsigned long long build_mask() const {
    unsigned long long mask = ~0ull >> (64 - numThreads);
    // thread_rank() gives thread id from 0..thread launch size.
    return mask << (((internal::workgroup::thread_rank() % __AMDGCN_WAVEFRONT_SIZE) / numThreads) *
                    numThreads);
  }
#endif

 public:
  __CG_STATIC_QUALIFIER__ void sync() {
    internal::tiled_group::sync();
  }

  template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
    return (__shfl(var, srcRank, numThreads));
  }

  template <class T> __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
    return (__shfl_down(var, lane_delta, numThreads));
  }

  template <class T> __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
    return (__shfl_up(var, lane_delta, numThreads));
  }

  template <class T> __CG_QUALIFIER__ T shfl_xor(T var, unsigned int laneMask) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
    return (__shfl_xor(var, laneMask, numThreads));
  }

#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
  __CG_QUALIFIER__ unsigned long long ballot(int pred) const {
    const auto mask = build_mask();
    return internal::helper::adjust_mask(mask, __ballot_sync(mask, pred));
  }

  __CG_QUALIFIER__ int any(int pred) const { return __any_sync(build_mask(), pred); }

  __CG_QUALIFIER__ int all(int pred) const { return __all_sync(build_mask(), pred); }

  template <typename T> __CG_QUALIFIER__ unsigned long long match_any(T value) const {
    const auto mask = build_mask();
    return internal::helper::adjust_mask(mask, __match_any_sync(mask, value));
  }

  template <typename T> __CG_QUALIFIER__ unsigned long long match_all(T value, int& pred) const {
    const auto mask = build_mask();
    return internal::helper::adjust_mask(mask, __match_all_sync(mask, value, &pred));
  }
#endif
};

template <unsigned int tileSize, typename ParentCGTy>
class parent_group_info {
 public:
  // Returns the linear rank of the group within the set of tiles partitioned
  // from a parent group (bounded by meta_group_size)
  __CG_STATIC_QUALIFIER__ unsigned int meta_group_rank() {
    return ParentCGTy::thread_rank() / tileSize;
  }

  // Returns the number of groups created when the parent group was partitioned.
  __CG_STATIC_QUALIFIER__ unsigned int meta_group_size() {
    return (ParentCGTy::size() + tileSize - 1) / tileSize;
  }
};

template <unsigned int tileSize, class ParentCGTy>
class thread_block_tile_type : public thread_block_tile_base<tileSize>,
                               public tiled_group,
                               public parent_group_info<tileSize, ParentCGTy> {
  _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
  typedef thread_block_tile_base<numThreads> tbtBase;

 protected:
  __CG_QUALIFIER__ thread_block_tile_type() : tiled_group(numThreads) {
    coalesced_info.tiled_info.size = numThreads;
    coalesced_info.tiled_info.is_tiled = true;
  }

 public:
  using tbtBase::size;
  using tbtBase::sync;
  using tbtBase::thread_rank;
};

// Partial template specialization
template <unsigned int tileSize>
class thread_block_tile_type<tileSize, void> : public thread_block_tile_base<tileSize>,
                                               public tiled_group {
  _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;

  typedef thread_block_tile_base<numThreads> tbtBase;

 protected:
  __CG_QUALIFIER__ thread_block_tile_type(unsigned int meta_group_rank, unsigned int meta_group_size)
      : tiled_group(numThreads) {
    coalesced_info.tiled_info.size = numThreads;
    coalesced_info.tiled_info.is_tiled = true;
    coalesced_info.tiled_info.meta_group_rank = meta_group_rank;
    coalesced_info.tiled_info.meta_group_size = meta_group_size;
  }

 public:
  using tbtBase::size;
  using tbtBase::sync;
  using tbtBase::thread_rank;

  __CG_QUALIFIER__ unsigned int meta_group_rank() const {
    return coalesced_info.tiled_info.meta_group_rank;
  }

  __CG_QUALIFIER__ unsigned int meta_group_size() const {
    return coalesced_info.tiled_info.meta_group_size;
  }
  // end of cooperative group
};

__CG_QUALIFIER__ thread_group this_thread() {
  thread_group g(internal::group_type::cg_coalesced_group, 1, __ockl_activelane_u32());
  return g;
}
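
// Illustrative usage sketch: this_thread() yields a group containing only the
// calling thread, e.g. as a degenerate argument to code written against the
// generic thread_group interface:
//
//   cooperative_groups::thread_group self = cooperative_groups::this_thread();
//   // self.size() == 1: the group contains just the calling thread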

__CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsigned int tile_size) {
  if (parent.cg_type() == internal::cg_tiled_group) {
    const tiled_group* cg = static_cast<const tiled_group*>(&parent);
    return cg->new_tiled_group(tile_size);
  } else if (parent.cg_type() == internal::cg_coalesced_group) {
    const coalesced_group* cg = static_cast<const coalesced_group*>(&parent);
    return cg->new_tiled_group(tile_size);
  } else {
    const thread_block* tb = static_cast<const thread_block*>(&parent);
    return tb->new_tiled_group(tile_size);
  }
}

// Thread block type overload
__CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent, unsigned int tile_size) {
  return (parent.new_tiled_group(tile_size));
}

__CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent, unsigned int tile_size) {
  return (parent.new_tiled_group(tile_size));
}

// If a coalesced group is passed to be partitioned, it should remain coalesced
__CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tile_size) {
  return (parent.new_tiled_group(tile_size));
}
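
// Illustrative usage sketch: the runtime-sized overloads above split a parent
// group into tiles whose size must be a power of two no larger than the
// wavefront size. Names below are hypothetical.
//
//   __global__ void tiled_kernel(unsigned int* out) {
//     namespace cg = cooperative_groups;
//     cg::thread_block block = cg::this_thread_block();
//     cg::thread_group tile16 = cg::tiled_partition(block, 16);
//     out[block.group_index().x * block.size() + block.thread_rank()] =
//         tile16.thread_rank();  // 0..15 within each 16-thread tile
//     tile16.sync();
//   }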

namespace impl {
template <unsigned int size, class ParentCGTy> class thread_block_tile_internal;

template <unsigned int size, class ParentCGTy>
class thread_block_tile_internal : public thread_block_tile_type<size, ParentCGTy> {
 protected:
  template <unsigned int tbtSize, class tbtParentT>
  __CG_QUALIFIER__ thread_block_tile_internal(
      const thread_block_tile_internal<tbtSize, tbtParentT>& g)
      : thread_block_tile_type<size, ParentCGTy>(g.meta_group_rank(), g.meta_group_size()) {}

  __CG_QUALIFIER__ thread_block_tile_internal(const thread_block& g)
      : thread_block_tile_type<size, ParentCGTy>() {}
};
}  // namespace impl

template <unsigned int size, class ParentCGTy>
class thread_block_tile : public impl::thread_block_tile_internal<size, ParentCGTy> {
 protected:
  __CG_QUALIFIER__ thread_block_tile(const ParentCGTy& g)
      : impl::thread_block_tile_internal<size, ParentCGTy>(g) {}

 public:
  __CG_QUALIFIER__ operator thread_block_tile<size, void>() const {
    return thread_block_tile<size, void>(*this);
  }
};

template <unsigned int size>
class thread_block_tile<size, void> : public impl::thread_block_tile_internal<size, void> {
  template <unsigned int, class ParentCGTy> friend class thread_block_tile;

 protected:
 public:
  template <class ParentCGTy>
  __CG_QUALIFIER__ thread_block_tile(const thread_block_tile<size, ParentCGTy>& g)
      : impl::thread_block_tile_internal<size, void>(g) {}
};

template <unsigned int size, class ParentCGTy = void> class thread_block_tile;

namespace impl {
template <unsigned int size, class ParentCGTy> struct tiled_partition_internal;

template <unsigned int size>
struct tiled_partition_internal<size, thread_block> : public thread_block_tile<size, thread_block> {
  __CG_QUALIFIER__ tiled_partition_internal(const thread_block& g)
      : thread_block_tile<size, thread_block>(g) {}
};

}  // namespace impl

template <unsigned int size, class ParentCGTy>
__CG_QUALIFIER__ thread_block_tile<size, ParentCGTy> tiled_partition(const ParentCGTy& g) {
  static_assert(is_valid_tile_size<size>::value,
                "Tiled partition with size > wavefront size. Currently not supported ");
  return impl::tiled_partition_internal<size, ParentCGTy>(g);
}
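
// Illustrative usage sketch: the compile-time form returns a thread_block_tile
// that adds the warp-level shuffle helpers, enabling the classic tile
// reduction. Kernel and buffer names are hypothetical, and the block size is
// assumed to be a multiple of 32.
//
//   __global__ void tile_reduce(const float* in, float* out) {
//     namespace cg = cooperative_groups;
//     cg::thread_block block = cg::this_thread_block();
//     auto tile = cg::tiled_partition<32>(block);  // thread_block_tile<32, thread_block>
//     float v = in[block.group_index().x * block.size() + block.thread_rank()];
//     for (unsigned int offset = tile.size() / 2; offset > 0; offset /= 2) {
//       v += tile.shfl_down(v, offset);  // reduce within the 32-thread tile
//     }
//     if (tile.thread_rank() == 0) {
//       out[block.group_index().x * (block.size() / 32) + tile.meta_group_rank()] = v;
//     }
//   }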

#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
__CG_QUALIFIER__ coalesced_group binary_partition(const coalesced_group& cgrp, bool pred) {
  auto mask = __ballot_sync<unsigned long long>(cgrp.coalesced_info.member_mask, pred);

  if (pred) {
    return coalesced_group(mask);
  } else {
    return coalesced_group(cgrp.coalesced_info.member_mask ^ mask);
  }
}

template <unsigned int size, class parent>
__CG_QUALIFIER__ coalesced_group binary_partition(const thread_block_tile<size, parent>& tgrp,
                                                  bool pred) {
  auto mask = __ballot_sync<unsigned long long>(tgrp.build_mask(), pred);

  if (pred) {
    return coalesced_group(mask);
  } else {
    return coalesced_group(tgrp.build_mask() ^ mask);
  }
}
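
// Illustrative usage sketch (only available when HIP_ENABLE_WARP_SYNC_BUILTINS
// is defined): binary_partition splits the calling group into two coalesced
// groups according to a predicate. Names below are hypothetical.
//
//   __device__ void split_by_sign(float v, int* pos_count, int* neg_count) {
//     namespace cg = cooperative_groups;
//     cg::coalesced_group active = cg::coalesced_threads();
//     cg::coalesced_group side = cg::binary_partition(active, v >= 0.0f);
//     // side contains only the lanes whose predicate matched this lane's
//     if (side.thread_rank() == 0) {
//       atomicAdd((v >= 0.0f) ? pos_count : neg_count, static_cast<int>(side.size()));
//     }
//   }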
#endif
}  // namespace cooperative_groups

#endif  // __cplusplus
#endif  // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H

Device-side implementation of the cooperative groups feature.