HIP: Heterogenous-computing Interface for Portability
Loading...
Searching...
No Matches
amd_warp_functions.h
1/*
2Copyright (c) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
3
4Permission is hereby granted, free of charge, to any person obtaining a copy
5of this software and associated documentation files (the "Software"), to deal
6in the Software without restriction, including without limitation the rights
7to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8copies of the Software, and to permit persons to whom the Software is
9furnished to do so, subject to the following conditions:
10
11The above copyright notice and this permission notice shall be included in
12all copies or substantial portions of the Software.
13
14THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20THE SOFTWARE.
21*/
22
23#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
24#define HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
25
26#if !defined(__HIPCC_RTC__)
27#include "device_library_decls.h" // ockl warp functions
28#endif // !defined(__HIPCC_RTC__)
29
30#if defined(__has_attribute) && __has_attribute(maybe_undef)
31#define MAYBE_UNDEF __attribute__((maybe_undef))
32#else
33#define MAYBE_UNDEF
34#endif
35
36__device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
37 union { int i; unsigned u; float f; } tmp; tmp.u = src;
38 tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
39 return tmp.u;
40}
41
42__device__ static inline float __hip_ds_bpermutef(int index, float src) {
43 union { int i; unsigned u; float f; } tmp; tmp.f = src;
44 tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
45 return tmp.f;
46}
47
48__device__ static inline unsigned __hip_ds_permute(int index, unsigned src) {
49 union { int i; unsigned u; float f; } tmp; tmp.u = src;
50 tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
51 return tmp.u;
52}
53
54__device__ static inline float __hip_ds_permutef(int index, float src) {
55 union { int i; unsigned u; float f; } tmp; tmp.f = src;
56 tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
57 return tmp.f;
58}
59
60#define __hip_ds_swizzle(src, pattern) __hip_ds_swizzle_N<(pattern)>((src))
61#define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))
62
63template <int pattern>
64__device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
65 union { int i; unsigned u; float f; } tmp; tmp.u = src;
66 tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
67 return tmp.u;
68}
69
70template <int pattern>
71__device__ static inline float __hip_ds_swizzlef_N(float src) {
72 union { int i; unsigned u; float f; } tmp; tmp.f = src;
73 tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
74 return tmp.f;
75}
76
77#define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \
78 __hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))
79
80template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
81__device__ static inline int __hip_move_dpp_N(int src) {
82 return __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask,
83 bound_ctrl);
84}
85
86__device__
87static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE;
88
89// warp vote function __all __any __ballot
90__device__
91inline
92int __all(int predicate) {
93 return __ockl_wfall_i32(predicate);
94}
95
96__device__
97inline
98int __any(int predicate) {
99 return __ockl_wfany_i32(predicate);
100}
101
102// XXX from llvm/include/llvm/IR/InstrTypes.h
103#define ICMP_NE 33
104
105__device__
106inline
107unsigned long long int __ballot(int predicate) {
108 return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
109}
110
111__device__
112inline
113unsigned long long int __ballot64(int predicate) {
114 return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
115}
116
117// See amd_warp_sync_functions.h for an explanation of this preprocessor flag.
118#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
119// Since threads in a wave do not make independent progress, __activemask()
120// always returns the exact active mask, i.e, all active threads in the wave.
121__device__
122inline
123unsigned long long __activemask() {
124 return __ballot(true);
125}
126#endif // HIP_ENABLE_WARP_SYNC_BUILTINS
127
128__device__ static inline unsigned int __lane_id() {
129 return __builtin_amdgcn_mbcnt_hi(
130 -1, __builtin_amdgcn_mbcnt_lo(-1, 0));
131}
132
133__device__
134inline
135int __shfl(MAYBE_UNDEF int var, int src_lane, int width = warpSize) {
136 int self = __lane_id();
137 int index = (src_lane & (width - 1)) + (self & ~(width-1));
138 return __builtin_amdgcn_ds_bpermute(index<<2, var);
139}
140__device__
141inline
142unsigned int __shfl(MAYBE_UNDEF unsigned int var, int src_lane, int width = warpSize) {
143 union { int i; unsigned u; float f; } tmp; tmp.u = var;
144 tmp.i = __shfl(tmp.i, src_lane, width);
145 return tmp.u;
146}
147__device__
148inline
149float __shfl(MAYBE_UNDEF float var, int src_lane, int width = warpSize) {
150 union { int i; unsigned u; float f; } tmp; tmp.f = var;
151 tmp.i = __shfl(tmp.i, src_lane, width);
152 return tmp.f;
153}
154__device__
155inline
156double __shfl(MAYBE_UNDEF double var, int src_lane, int width = warpSize) {
157 static_assert(sizeof(double) == 2 * sizeof(int), "");
158 static_assert(sizeof(double) == sizeof(uint64_t), "");
159
160 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
161 tmp[0] = __shfl(tmp[0], src_lane, width);
162 tmp[1] = __shfl(tmp[1], src_lane, width);
163
164 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
165 double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
166 return tmp1;
167}
168__device__
169inline
170long __shfl(MAYBE_UNDEF long var, int src_lane, int width = warpSize)
171{
172 #ifndef _MSC_VER
173 static_assert(sizeof(long) == 2 * sizeof(int), "");
174 static_assert(sizeof(long) == sizeof(uint64_t), "");
175
176 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
177 tmp[0] = __shfl(tmp[0], src_lane, width);
178 tmp[1] = __shfl(tmp[1], src_lane, width);
179
180 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
181 long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
182 return tmp1;
183 #else
184 static_assert(sizeof(long) == sizeof(int), "");
185 return static_cast<long>(__shfl(static_cast<int>(var), src_lane, width));
186 #endif
187}
188__device__
189inline
190unsigned long __shfl(MAYBE_UNDEF unsigned long var, int src_lane, int width = warpSize) {
191 #ifndef _MSC_VER
192 static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
193 static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
194
195 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
196 tmp[0] = __shfl(tmp[0], src_lane, width);
197 tmp[1] = __shfl(tmp[1], src_lane, width);
198
199 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
200 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
201 return tmp1;
202 #else
203 static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
204 return static_cast<unsigned long>(__shfl(static_cast<unsigned int>(var), src_lane, width));
205 #endif
206}
207__device__
208inline
209long long __shfl(MAYBE_UNDEF long long var, int src_lane, int width = warpSize)
210{
211 static_assert(sizeof(long long) == 2 * sizeof(int), "");
212 static_assert(sizeof(long long) == sizeof(uint64_t), "");
213
214 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
215 tmp[0] = __shfl(tmp[0], src_lane, width);
216 tmp[1] = __shfl(tmp[1], src_lane, width);
217
218 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
219 long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
220 return tmp1;
221}
222__device__
223inline
224unsigned long long __shfl(MAYBE_UNDEF unsigned long long var, int src_lane, int width = warpSize) {
225 static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
226 static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
227
228 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
229 tmp[0] = __shfl(tmp[0], src_lane, width);
230 tmp[1] = __shfl(tmp[1], src_lane, width);
231
232 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
233 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
234 return tmp1;
235}
236
237__device__
238inline
239int __shfl_up(MAYBE_UNDEF int var, unsigned int lane_delta, int width = warpSize) {
240 int self = __lane_id();
241 int index = self - lane_delta;
242 index = (index < (self & ~(width-1)))?self:index;
243 return __builtin_amdgcn_ds_bpermute(index<<2, var);
244}
245__device__
246inline
247unsigned int __shfl_up(MAYBE_UNDEF unsigned int var, unsigned int lane_delta, int width = warpSize) {
248 union { int i; unsigned u; float f; } tmp; tmp.u = var;
249 tmp.i = __shfl_up(tmp.i, lane_delta, width);
250 return tmp.u;
251}
252__device__
253inline
254float __shfl_up(MAYBE_UNDEF float var, unsigned int lane_delta, int width = warpSize) {
255 union { int i; unsigned u; float f; } tmp; tmp.f = var;
256 tmp.i = __shfl_up(tmp.i, lane_delta, width);
257 return tmp.f;
258}
259__device__
260inline
261double __shfl_up(MAYBE_UNDEF double var, unsigned int lane_delta, int width = warpSize) {
262 static_assert(sizeof(double) == 2 * sizeof(int), "");
263 static_assert(sizeof(double) == sizeof(uint64_t), "");
264
265 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
266 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
267 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
268
269 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
270 double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
271 return tmp1;
272}
273__device__
274inline
275long __shfl_up(MAYBE_UNDEF long var, unsigned int lane_delta, int width = warpSize)
276{
277 #ifndef _MSC_VER
278 static_assert(sizeof(long) == 2 * sizeof(int), "");
279 static_assert(sizeof(long) == sizeof(uint64_t), "");
280
281 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
282 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
283 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
284
285 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
286 long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
287 return tmp1;
288 #else
289 static_assert(sizeof(long) == sizeof(int), "");
290 return static_cast<long>(__shfl_up(static_cast<int>(var), lane_delta, width));
291 #endif
292}
293
294__device__
295inline
296unsigned long __shfl_up(MAYBE_UNDEF unsigned long var, unsigned int lane_delta, int width = warpSize)
297{
298 #ifndef _MSC_VER
299 static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
300 static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
301
302 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
303 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
304 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
305
306 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
307 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
308 return tmp1;
309 #else
310 static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
311 return static_cast<unsigned long>(__shfl_up(static_cast<unsigned int>(var), lane_delta, width));
312 #endif
313}
314
315__device__
316inline
317long long __shfl_up(MAYBE_UNDEF long long var, unsigned int lane_delta, int width = warpSize)
318{
319 static_assert(sizeof(long long) == 2 * sizeof(int), "");
320 static_assert(sizeof(long long) == sizeof(uint64_t), "");
321 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
322 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
323 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
324 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
325 long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
326 return tmp1;
327}
328
329__device__
330inline
331unsigned long long __shfl_up(MAYBE_UNDEF unsigned long long var, unsigned int lane_delta, int width = warpSize)
332{
333 static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
334 static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
335 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
336 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
337 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
338 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
339 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
340 return tmp1;
341}
342
343__device__
344inline
345int __shfl_down(MAYBE_UNDEF int var, unsigned int lane_delta, int width = warpSize) {
346 int self = __lane_id();
347 int index = self + lane_delta;
348 index = (int)((self&(width-1))+lane_delta) >= width?self:index;
349 return __builtin_amdgcn_ds_bpermute(index<<2, var);
350}
351__device__
352inline
353unsigned int __shfl_down(MAYBE_UNDEF unsigned int var, unsigned int lane_delta, int width = warpSize) {
354 union { int i; unsigned u; float f; } tmp; tmp.u = var;
355 tmp.i = __shfl_down(tmp.i, lane_delta, width);
356 return tmp.u;
357}
358__device__
359inline
360float __shfl_down(MAYBE_UNDEF float var, unsigned int lane_delta, int width = warpSize) {
361 union { int i; unsigned u; float f; } tmp; tmp.f = var;
362 tmp.i = __shfl_down(tmp.i, lane_delta, width);
363 return tmp.f;
364}
365__device__
366inline
367double __shfl_down(MAYBE_UNDEF double var, unsigned int lane_delta, int width = warpSize) {
368 static_assert(sizeof(double) == 2 * sizeof(int), "");
369 static_assert(sizeof(double) == sizeof(uint64_t), "");
370
371 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
372 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
373 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
374
375 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
376 double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
377 return tmp1;
378}
379__device__
380inline
381long __shfl_down(MAYBE_UNDEF long var, unsigned int lane_delta, int width = warpSize)
382{
383 #ifndef _MSC_VER
384 static_assert(sizeof(long) == 2 * sizeof(int), "");
385 static_assert(sizeof(long) == sizeof(uint64_t), "");
386
387 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
388 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
389 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
390
391 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
392 long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
393 return tmp1;
394 #else
395 static_assert(sizeof(long) == sizeof(int), "");
396 return static_cast<long>(__shfl_down(static_cast<int>(var), lane_delta, width));
397 #endif
398}
399__device__
400inline
401unsigned long __shfl_down(MAYBE_UNDEF unsigned long var, unsigned int lane_delta, int width = warpSize)
402{
403 #ifndef _MSC_VER
404 static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
405 static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
406
407 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
408 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
409 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
410
411 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
412 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
413 return tmp1;
414 #else
415 static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
416 return static_cast<unsigned long>(__shfl_down(static_cast<unsigned int>(var), lane_delta, width));
417 #endif
418}
419__device__
420inline
421long long __shfl_down(MAYBE_UNDEF long long var, unsigned int lane_delta, int width = warpSize)
422{
423 static_assert(sizeof(long long) == 2 * sizeof(int), "");
424 static_assert(sizeof(long long) == sizeof(uint64_t), "");
425 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
426 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
427 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
428 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
429 long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
430 return tmp1;
431}
432__device__
433inline
434unsigned long long __shfl_down(MAYBE_UNDEF unsigned long long var, unsigned int lane_delta, int width = warpSize)
435{
436 static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
437 static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
438 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
439 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
440 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
441 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
442 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
443 return tmp1;
444}
445
446__device__
447inline
448int __shfl_xor(MAYBE_UNDEF int var, int lane_mask, int width = warpSize) {
449 int self = __lane_id();
450 int index = self^lane_mask;
451 index = index >= ((self+width)&~(width-1))?self:index;
452 return __builtin_amdgcn_ds_bpermute(index<<2, var);
453}
454__device__
455inline
456unsigned int __shfl_xor(MAYBE_UNDEF unsigned int var, int lane_mask, int width = warpSize) {
457 union { int i; unsigned u; float f; } tmp; tmp.u = var;
458 tmp.i = __shfl_xor(tmp.i, lane_mask, width);
459 return tmp.u;
460}
461__device__
462inline
463float __shfl_xor(MAYBE_UNDEF float var, int lane_mask, int width = warpSize) {
464 union { int i; unsigned u; float f; } tmp; tmp.f = var;
465 tmp.i = __shfl_xor(tmp.i, lane_mask, width);
466 return tmp.f;
467}
468__device__
469inline
470double __shfl_xor(MAYBE_UNDEF double var, int lane_mask, int width = warpSize) {
471 static_assert(sizeof(double) == 2 * sizeof(int), "");
472 static_assert(sizeof(double) == sizeof(uint64_t), "");
473
474 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
475 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
476 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
477
478 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
479 double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
480 return tmp1;
481}
482__device__
483inline
484long __shfl_xor(MAYBE_UNDEF long var, int lane_mask, int width = warpSize)
485{
486 #ifndef _MSC_VER
487 static_assert(sizeof(long) == 2 * sizeof(int), "");
488 static_assert(sizeof(long) == sizeof(uint64_t), "");
489
490 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
491 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
492 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
493
494 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
495 long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
496 return tmp1;
497 #else
498 static_assert(sizeof(long) == sizeof(int), "");
499 return static_cast<long>(__shfl_xor(static_cast<int>(var), lane_mask, width));
500 #endif
501}
502__device__
503inline
504unsigned long __shfl_xor(MAYBE_UNDEF unsigned long var, int lane_mask, int width = warpSize)
505{
506 #ifndef _MSC_VER
507 static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
508 static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
509
510 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
511 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
512 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
513
514 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
515 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
516 return tmp1;
517 #else
518 static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
519 return static_cast<unsigned long>(__shfl_xor(static_cast<unsigned int>(var), lane_mask, width));
520 #endif
521}
522__device__
523inline
524long long __shfl_xor(MAYBE_UNDEF long long var, int lane_mask, int width = warpSize)
525{
526 static_assert(sizeof(long long) == 2 * sizeof(int), "");
527 static_assert(sizeof(long long) == sizeof(uint64_t), "");
528 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
529 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
530 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
531 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
532 long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
533 return tmp1;
534}
535__device__
536inline
537unsigned long long __shfl_xor(MAYBE_UNDEF unsigned long long var, int lane_mask, int width = warpSize)
538{
539 static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
540 static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
541 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
542 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
543 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
544 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
545 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
546 return tmp1;
547}
548
549#endif
Contains declarations for types and functions in device library. Uses int64_t and uint64_t instead of...