108#ifndef _HIP_INCLUDE_HIP_AMD_DETAIL_HIP_BF16_H_
109#define _HIP_INCLUDE_HIP_AMD_DETAIL_HIP_BF16_H_
111#if !defined(__HIPCC_RTC__)
112#include <hip/amd_detail/amd_hip_common.h>
113#include <hip/amd_detail/amd_warp_functions.h>
114#include "amd_hip_vector_types.h"
116#if defined(__clang__) && defined(__HIP__)
117#include "amd_hip_atomic.h"
122#define __BF16_DEVICE__ __device__
123#if defined(__HIPCC_RTC__)
124#define __BF16_HOST_DEVICE__ __BF16_DEVICE__
129#define __BF16_HOST_DEVICE__ __host__ __BF16_DEVICE__
131#define __BF16_DEVICE_STATIC__ __BF16_DEVICE__ static inline
132#define __BF16_HOST_DEVICE_STATIC__ __BF16_HOST_DEVICE__ static inline
134#if defined(__AVX512VL__) and defined(__AVX512BF16__) and not defined(__HIP_DEVICE_COMPILE__)
136#if defined(__MINGW64__)
139#include <immintrin.h>
141#define HIP_BF16_AVX512_OP 1
142static_assert(
sizeof(__bf16) ==
sizeof(
unsigned short),
143 "sizeof __bf16 should match sizeof unsigned short");
145#define HIP_BF16_AVX512_OP 0
148#define HIPRT_ONE_BF16 __ushort_as_bfloat16((unsigned short)0x3F80U)
149#define HIPRT_ZERO_BF16 __ushort_as_bfloat16((unsigned short)0x0000U)
150#define HIPRT_INF_BF16 __ushort_as_bfloat16((unsigned short)0x7F80U)
151#define HIPRT_MAX_NORMAL_BF16 __ushort_as_bfloat16((unsigned short)0x7F7FU)
152#define HIPRT_MIN_DENORM_BF16 __ushort_as_bfloat16((unsigned short)0x0001U)
153#define HIPRT_NAN_BF16 __ushort_as_bfloat16((unsigned short)0x7FFFU)
154#define HIPRT_NEG_ZERO_BF16 __ushort_as_bfloat16((unsigned short)0x8000U)
159#if !defined(__HIPCC_RTC__)
160static_assert(CHAR_BIT == 8,
"byte size should be of 8 bits");
162static_assert(
sizeof(
unsigned short) == 2,
"size of unsigned short should be 2 bytes");
179} __hip_bfloat162_raw;
189 __BF16_HOST_DEVICE_STATIC__
float bfloatraw_2_float(
unsigned short val) {
190#if HIP_BF16_AVX512_OP
195 return _mm_cvtsbh_ss(u.bf16);
197 unsigned int uval = val << 16;
206 __BF16_HOST_DEVICE_STATIC__
unsigned short float_2_bfloatraw(
float f) {
207#if HIP_BF16_AVX512_OP
211 } u = {_mm_cvtness_sbh(f)};
218 if (~u.u32 & 0x7f800000) {
235 u.u32 += 0x7fff + ((u.u32 >> 16) & 1);
236 }
else if (u.u32 & 0xffff) {
247 return static_cast<unsigned short>(u.u32 >> 16);
251 __BF16_HOST_DEVICE_STATIC__
unsigned short double_2_bfloatraw(
double d_in) {
255 } u = {
static_cast<float>(d_in)};
259 if ((d_in > 0.0 && d > d_in) || (d_in < 0.0 && d < d_in)) {
264 return float_2_bfloatraw(u.fp32);
281 __BF16_HOST_DEVICE__ __hip_bfloat16(
unsigned int val)
282 : __x(double_2_bfloatraw(static_cast<double>(val))) {}
285 __BF16_HOST_DEVICE__ __hip_bfloat16(
int val)
286 : __x(double_2_bfloatraw(static_cast<double>(val))) {}
289 __BF16_HOST_DEVICE__ __hip_bfloat16(
unsigned short val)
290 : __x(float_2_bfloatraw(static_cast<float>(val))) {}
293 __BF16_HOST_DEVICE__ __hip_bfloat16(
short val)
294 : __x(float_2_bfloatraw(static_cast<float>(val))) {}
297 __BF16_HOST_DEVICE__ __hip_bfloat16(
const double val) : __x(double_2_bfloatraw(val)) {}
300 __BF16_HOST_DEVICE__ __hip_bfloat16(
const float val) : __x(float_2_bfloatraw(val)) {}
303 __BF16_HOST_DEVICE__ __hip_bfloat16(
const __hip_bfloat16_raw& val) : __x(val.x) {}
306 __BF16_HOST_DEVICE__ __hip_bfloat16() =
default;
309 __BF16_HOST_DEVICE__
operator __hip_bfloat16_raw()
const {
return __hip_bfloat16_raw{__x}; }
312 __BF16_HOST_DEVICE__
operator __hip_bfloat16_raw()
const volatile {
313 return __hip_bfloat16_raw{__x};
317 __BF16_HOST_DEVICE__
operator bool()
const {
318 auto val = bfloatraw_2_float(__x);
319 return val != 0.0f && val != -0.0f;
323 __BF16_HOST_DEVICE__
operator char()
const {
return static_cast<char>(bfloatraw_2_float(__x)); }
326 __BF16_HOST_DEVICE__
operator float()
const {
return bfloatraw_2_float(__x); }
329 __BF16_HOST_DEVICE__
operator int()
const {
return static_cast<int>(bfloatraw_2_float(__x)); }
332 __BF16_HOST_DEVICE__
operator long()
const {
return static_cast<long>(bfloatraw_2_float(__x)); }
335 __BF16_HOST_DEVICE__
operator long long()
const {
336 return static_cast<long long>(bfloatraw_2_float(__x));
340 __BF16_HOST_DEVICE__
operator short()
const {
return static_cast<short>(bfloatraw_2_float(__x)); }
343 __BF16_HOST_DEVICE__
operator signed char()
const {
344 return static_cast<signed char>(bfloatraw_2_float(__x));
348 __BF16_HOST_DEVICE__
operator unsigned char()
const {
349 return static_cast<unsigned char>(bfloatraw_2_float(__x));
353 __BF16_HOST_DEVICE__
operator unsigned int()
const {
354 return static_cast<unsigned int>(bfloatraw_2_float(__x));
358 __BF16_HOST_DEVICE__
operator unsigned long()
const {
359 return static_cast<unsigned long>(bfloatraw_2_float(__x));
363 __BF16_HOST_DEVICE__
operator unsigned long long()
const {
364 return static_cast<unsigned long long>(bfloatraw_2_float(__x));
368 __BF16_HOST_DEVICE__
operator unsigned short()
const {
369 return static_cast<unsigned short>(bfloatraw_2_float(__x));
375 __BF16_HOST_DEVICE__ __hip_bfloat16& operator=(
unsigned int val) {
376 __x = float_2_bfloatraw(
static_cast<float>(val));
381 __BF16_HOST_DEVICE__ __hip_bfloat16& operator=(
int val) {
382 __x = float_2_bfloatraw(
static_cast<float>(val));
387 __BF16_HOST_DEVICE__ __hip_bfloat16& operator=(
unsigned short val) {
388 __x = float_2_bfloatraw(
static_cast<float>(val));
393 __BF16_HOST_DEVICE__ __hip_bfloat16& operator=(
short val) {
394 __x = float_2_bfloatraw(
static_cast<float>(val));
399 __BF16_HOST_DEVICE__ __hip_bfloat16& operator=(
const double f) {
400 __x = float_2_bfloatraw(
static_cast<float>(f));
405 __BF16_HOST_DEVICE__ __hip_bfloat16& operator=(
const float f) {
406 __x = float_2_bfloatraw(f);
411 __BF16_HOST_DEVICE__ __hip_bfloat16& operator=(
const __hip_bfloat16_raw& hr) {
417 __BF16_HOST_DEVICE__
volatile __hip_bfloat16& operator=(
const __hip_bfloat16_raw& hr)
volatile {
423 __BF16_HOST_DEVICE__
volatile __hip_bfloat16& operator=(
424 const volatile __hip_bfloat16_raw& hr)
volatile {
445 __BF16_HOST_DEVICE__ __hip_bfloat162(
const __hip_bfloat162_raw& h2r)
446 : x(__hip_bfloat16(__hip_bfloat16_raw{h2r.x})),
447 y(__hip_bfloat16(__hip_bfloat16_raw{h2r.y})) {}
450 __BF16_HOST_DEVICE__ __hip_bfloat162(
const __hip_bfloat162& val) : x(val.x), y(val.y) {}
453 __BF16_HOST_DEVICE__ __hip_bfloat162(
const __hip_bfloat16& a,
const __hip_bfloat16& b)
457 __BF16_HOST_DEVICE__ __hip_bfloat162() =
default;
460 __BF16_HOST_DEVICE__
operator __hip_bfloat162_raw()
const {
461 __hip_bfloat16_raw l = x;
462 __hip_bfloat16_raw r = y;
463 return __hip_bfloat162_raw{l.x, r.x};
467 __BF16_HOST_DEVICE__
operator float2()
const {
468#if HIP_BF16_AVX512_OP
470 __hip_bfloat162_raw raw2;
472 static_assert(
sizeof(__bf16[2]) ==
sizeof(__hip_bfloat162_raw));
475 __m128bh pbf16{u.bf162[0], u.bf162[1], 0, 0};
476 __m128 pf32 = _mm_cvtpbh_ps(pbf16);
477 float2 ret(pf32[0], pf32[1]);
485 __BF16_HOST_DEVICE__ __hip_bfloat162& operator=(
const __hip_bfloat162_raw& h2r) {
486 x = __hip_bfloat16(__hip_bfloat16_raw{h2r.x});
487 y = __hip_bfloat16(__hip_bfloat16_raw{h2r.y});
492 __BF16_HOST_DEVICE__ __hip_bfloat162& operator=(
const __hip_bfloat162& src) {
514 __hip_bfloat16 ret{f};
532 return __hip_bfloat162(a, a);
540 static_assert(
sizeof(__hip_bfloat16) ==
sizeof(
short int));
553 static_assert(
sizeof(__hip_bfloat16) ==
sizeof(
unsigned short int));
556 unsigned short int usi;
566 __hip_bfloat16 ret{a};
583 const __hip_bfloat16 b) {
584 return __hip_bfloat162(a, b);
591__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16
__high2bfloat16(
const __hip_bfloat162 a) {
return a.y; }
598 return __hip_bfloat162(a.y, a.y);
605__BF16_HOST_DEVICE_STATIC__
float __high2float(
const __hip_bfloat162 a) {
614 const __hip_bfloat162 b) {
615 return __hip_bfloat162(a.y, b.y);
622__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16
__low2bfloat16(
const __hip_bfloat162 a) {
return a.x; }
629 return __hip_bfloat162(a.x, a.x);
636__BF16_HOST_DEVICE_STATIC__
float __low2float(
const __hip_bfloat162 a) {
645 return __hip_bfloat162(a.y, a.x);
653 const __hip_bfloat162 b) {
654 return __hip_bfloat162(a.x, b.x);
662 static_assert(
sizeof(__hip_bfloat16) ==
sizeof(
short int));
675 static_assert(
sizeof(__hip_bfloat16) ==
sizeof(
unsigned short int));
677 unsigned short int usi;
683#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
688__BF16_DEVICE_STATIC__ __hip_bfloat16 __shfl_down_sync(
const unsigned long long mask,
689 const __hip_bfloat16 in,
690 const unsigned int delta,
691 const int width = __AMDGCN_WAVEFRONT_SIZE) {
699__BF16_DEVICE_STATIC__ __hip_bfloat162 __shfl_down_sync(
const unsigned long long mask,
700 const __hip_bfloat162 in,
701 const unsigned int delta,
702 const int width = __AMDGCN_WAVEFRONT_SIZE) {
703 static_assert(
sizeof(__hip_bfloat162) ==
sizeof(
unsigned int));
705 __hip_bfloat162 bf162;
708 u.ui = __shfl_down_sync<unsigned long long, unsigned int>(mask, u.ui, delta, width);
716__BF16_DEVICE_STATIC__ __hip_bfloat16 __shfl_sync(
const unsigned long long mask,
717 const __hip_bfloat16 in,
const int delta,
718 const int width = __AMDGCN_WAVEFRONT_SIZE) {
726__BF16_DEVICE_STATIC__ __hip_bfloat162 __shfl_sync(
const unsigned long long mask,
727 const __hip_bfloat162 in,
const int delta,
728 const int width = __AMDGCN_WAVEFRONT_SIZE) {
729 static_assert(
sizeof(__hip_bfloat162) ==
sizeof(
unsigned int));
731 __hip_bfloat162 bf162;
734 u.ui = __shfl_sync(mask, u.ui, delta, width);
742__BF16_DEVICE_STATIC__ __hip_bfloat16 __shfl_up_sync(
const unsigned long long mask,
743 const __hip_bfloat16 in,
744 const unsigned int delta,
745 const int width = __AMDGCN_WAVEFRONT_SIZE) {
753__BF16_DEVICE_STATIC__ __hip_bfloat162 __shfl_up_sync(
const unsigned long long mask,
754 const __hip_bfloat162 in,
755 const unsigned int delta,
756 const int width = __AMDGCN_WAVEFRONT_SIZE) {
757 static_assert(
sizeof(__hip_bfloat162) ==
sizeof(
unsigned int));
759 __hip_bfloat162 bf162;
762 u.ui = __shfl_up_sync(mask, u.ui, delta, width);
770__BF16_DEVICE_STATIC__ __hip_bfloat16 __shfl_xor_sync(
const unsigned long long mask,
771 const __hip_bfloat16 in,
const int delta,
772 const int width = __AMDGCN_WAVEFRONT_SIZE) {
780__BF16_DEVICE_STATIC__ __hip_bfloat162 __shfl_xor_sync(
const unsigned long long mask,
781 const __hip_bfloat162 in,
const int delta,
782 const int width = __AMDGCN_WAVEFRONT_SIZE) {
783 static_assert(
sizeof(__hip_bfloat162) ==
sizeof(
unsigned int));
785 __hip_bfloat162 bf162;
788 u.ui = __shfl_xor_sync(mask, u.ui, delta, width);
797__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16
__hadd(
const __hip_bfloat16 a,
const __hip_bfloat16 b) {
805__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16
__hsub(
const __hip_bfloat16 a,
const __hip_bfloat16 b) {
813__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16
__hdiv(
const __hip_bfloat16 a,
const __hip_bfloat16 b) {
821__BF16_DEVICE_STATIC__ __hip_bfloat16
__hfma(
const __hip_bfloat16 a,
const __hip_bfloat16 b,
822 const __hip_bfloat16 c) {
831__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16
__hmul(
const __hip_bfloat16 a,
const __hip_bfloat16 b) {
839__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16
__hneg(
const __hip_bfloat16 a) {
840 __hip_bfloat16_raw hr = a;
842 return __hip_bfloat16(hr);
849__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16
__habs(
const __hip_bfloat16 a) {
850 __hip_bfloat16_raw hr = a;
852 return __hip_bfloat16(hr);
859__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162
__h2div(
const __hip_bfloat162 a,
860 const __hip_bfloat162 b) {
869__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162
__habs2(
const __hip_bfloat162 a) {
877__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162
__hadd2(
const __hip_bfloat162 a,
878 const __hip_bfloat162 b) {
879 return __hip_bfloat162(
__hadd(a.x, b.x), __hadd(a.y, b.y));
886__BF16_DEVICE_STATIC__ __hip_bfloat162
__hfma2(
const __hip_bfloat162 a,
const __hip_bfloat162 b,
887 const __hip_bfloat162 c) {
888 return __hip_bfloat162(
__hfma(a.x, b.x, c.x),
__hfma(a.y, b.y, c.y));
895__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162
__hmul2(
const __hip_bfloat162 a,
896 const __hip_bfloat162 b) {
897 return __hip_bfloat162(
__hmul(a.x, b.x),
__hmul(a.y, b.y));
904__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162
__hneg2(
const __hip_bfloat162 a) {
912__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162
__hsub2(
const __hip_bfloat162 a,
913 const __hip_bfloat162 b) {
914 return __hip_bfloat162(
__hsub(a.x, b.x),
__hsub(a.y, b.y));
921__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16
operator*(
const __hip_bfloat16& l,
922 const __hip_bfloat16& r) {
930__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16&
operator*=(__hip_bfloat16& l,
const __hip_bfloat16& r) {
939__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16
operator+(
const __hip_bfloat16& l) {
return l; }
945__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16
operator+(
const __hip_bfloat16& l,
946 const __hip_bfloat16& r) {
954__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16
operator-(
const __hip_bfloat16& l) {
return __hneg(l); }
960__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16
operator-(
const __hip_bfloat16& l,
961 const __hip_bfloat16& r) {
969__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16
operator++(__hip_bfloat16& l,
const int) {
971 l =
__hadd(l, HIPRT_ONE_BF16);
979__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16&
operator++(__hip_bfloat16& l) {
980 l =
__hadd(l, HIPRT_ONE_BF16);
988__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16
operator--(__hip_bfloat16& l,
const int) {
990 l =
__hsub(l, HIPRT_ONE_BF16);
998__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16&
operator--(__hip_bfloat16& l) {
999 l =
__hsub(l, HIPRT_ONE_BF16);
1007__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16&
operator+=(__hip_bfloat16& l,
const __hip_bfloat16& r) {
1016__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16&
operator-=(__hip_bfloat16& l,
const __hip_bfloat16& r) {
1025__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16
operator/(
const __hip_bfloat16& l,
1026 const __hip_bfloat16& r) {
1034__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16&
operator/=(__hip_bfloat16& l,
const __hip_bfloat16& r) {
1043__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162
operator*(
const __hip_bfloat162& l,
1044 const __hip_bfloat162& r) {
1052__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162&
operator*=(__hip_bfloat162& l,
1053 const __hip_bfloat162& r) {
1062__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162
operator+(
const __hip_bfloat162& l) {
return l; }
1068__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162
operator+(
const __hip_bfloat162& l,
1069 const __hip_bfloat162& r) {
1077__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162
operator-(
const __hip_bfloat162& l) {
1085__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162
operator-(
const __hip_bfloat162& l,
1086 const __hip_bfloat162& r) {
1094__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162
operator++(__hip_bfloat162& l,
const int) {
1096 l =
__hadd2(l, {HIPRT_ONE_BF16, HIPRT_ONE_BF16});
1104__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162&
operator++(__hip_bfloat162& l) {
1105 l =
__hadd2(l, {HIPRT_ONE_BF16, HIPRT_ONE_BF16});
1113__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162
operator--(__hip_bfloat162& l,
const int) {
1115 l =
__hsub2(l, {HIPRT_ONE_BF16, HIPRT_ONE_BF16});
1123__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162&
operator--(__hip_bfloat162& l) {
1124 l =
__hsub2(l, {HIPRT_ONE_BF16, HIPRT_ONE_BF16});
1132__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162&
operator+=(__hip_bfloat162& l,
1133 const __hip_bfloat162& r) {
1142__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162&
operator-=(__hip_bfloat162& l,
1143 const __hip_bfloat162& r) {
1152__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162
operator/(
const __hip_bfloat162& l,
1153 const __hip_bfloat162& r) {
1161__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162&
operator/=(__hip_bfloat162& l,
1162 const __hip_bfloat162& r) {
1171__BF16_HOST_DEVICE_STATIC__
bool __heq(
const __hip_bfloat16 a,
const __hip_bfloat16 b) {
1179__BF16_HOST_DEVICE_STATIC__
bool __hequ(
const __hip_bfloat16 a,
const __hip_bfloat16 b) {
1188__BF16_HOST_DEVICE_STATIC__
bool __hgt(
const __hip_bfloat16 a,
const __hip_bfloat16 b) {
1196__BF16_HOST_DEVICE_STATIC__
bool __hgtu(
const __hip_bfloat16 a,
const __hip_bfloat16 b) {
1204__BF16_HOST_DEVICE_STATIC__
bool __hge(
const __hip_bfloat16 a,
const __hip_bfloat16 b) {
1212__BF16_HOST_DEVICE_STATIC__
bool __hgeu(
const __hip_bfloat16 a,
const __hip_bfloat16 b) {
1220__BF16_HOST_DEVICE_STATIC__
bool __hne(
const __hip_bfloat16 a,
const __hip_bfloat16 b) {
1228__BF16_HOST_DEVICE_STATIC__
bool __hneu(
const __hip_bfloat16 a,
const __hip_bfloat16 b) {
1236__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16
__hmax(
const __hip_bfloat16 a,
const __hip_bfloat16 b) {
1237#if __HIP_DEVICE_COMPILE__
1248__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16
__hmin(
const __hip_bfloat16 a,
const __hip_bfloat16 b) {
1249#if __HIP_DEVICE_COMPILE__
1260__BF16_HOST_DEVICE_STATIC__
bool __hlt(
const __hip_bfloat16 a,
const __hip_bfloat16 b) {
1268__BF16_HOST_DEVICE_STATIC__
bool __hltu(
const __hip_bfloat16 a,
const __hip_bfloat16 b) {
1276__BF16_HOST_DEVICE_STATIC__
bool __hle(
const __hip_bfloat16 a,
const __hip_bfloat16 b) {
1284__BF16_HOST_DEVICE_STATIC__
bool __hleu(
const __hip_bfloat16 a,
const __hip_bfloat16 b) {
1292__BF16_HOST_DEVICE_STATIC__
int __hisinf(
const __hip_bfloat16 a) {
1293 __hip_bfloat16_raw hr = a;
1294 return !(~hr.x & 0x7f80) && !(hr.x & 0x7f);
1301__BF16_HOST_DEVICE_STATIC__
bool __hisnan(
const __hip_bfloat16 a) {
1302 __hip_bfloat16_raw hr = a;
1303 return !(~hr.x & 0x7f80) && +(hr.x & 0x7f);
1310__BF16_HOST_DEVICE_STATIC__
bool __hbeq2(
const __hip_bfloat162 a,
const __hip_bfloat162 b) {
1318__BF16_HOST_DEVICE_STATIC__
bool __hbequ2(
const __hip_bfloat162 a,
const __hip_bfloat162 b) {
1326__BF16_HOST_DEVICE_STATIC__
bool __hbge2(
const __hip_bfloat162 a,
const __hip_bfloat162 b) {
1334__BF16_HOST_DEVICE_STATIC__
bool __hbgeu2(
const __hip_bfloat162 a,
const __hip_bfloat162 b) {
1342__BF16_HOST_DEVICE_STATIC__
bool __hbgt2(
const __hip_bfloat162 a,
const __hip_bfloat162 b) {
1350__BF16_HOST_DEVICE_STATIC__
bool __hbgtu2(
const __hip_bfloat162 a,
const __hip_bfloat162 b) {
1358__BF16_HOST_DEVICE_STATIC__
bool __hble2(
const __hip_bfloat162 a,
const __hip_bfloat162 b) {
1366__BF16_HOST_DEVICE_STATIC__
bool __hbleu2(
const __hip_bfloat162 a,
const __hip_bfloat162 b) {
1374__BF16_HOST_DEVICE_STATIC__
bool __hblt2(
const __hip_bfloat162 a,
const __hip_bfloat162 b) {
1382__BF16_HOST_DEVICE_STATIC__
bool __hbltu2(
const __hip_bfloat162 a,
const __hip_bfloat162 b) {
1390__BF16_HOST_DEVICE_STATIC__
bool __hbne2(
const __hip_bfloat162 a,
const __hip_bfloat162 b) {
1398__BF16_HOST_DEVICE_STATIC__
bool __hbneu2(
const __hip_bfloat162 a,
const __hip_bfloat162 b) {
1406__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162
__heq2(
const __hip_bfloat162 a,
1407 const __hip_bfloat162 b) {
1408 return __hip_bfloat162{{
__heq(a.x, b.x) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16},
1409 {
__heq(a.y, b.y) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}};
1416__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162
__hge2(
const __hip_bfloat162 a,
1417 const __hip_bfloat162 b) {
1418 return __hip_bfloat162{{
__hge(a.x, b.x) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16},
1419 {
__hge(a.y, b.y) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}};
1426__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162
__hgt2(
const __hip_bfloat162 a,
1427 const __hip_bfloat162 b) {
1428 return __hip_bfloat162{{
__hgt(a.x, b.x) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16},
1429 {
__hgt(a.y, b.y) ? HIPRT_ONE_BF16 : HIPRT_ONE_BF16}};
1436__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162
__hisnan2(
const __hip_bfloat162 a) {
1437 return __hip_bfloat162{{
__hisnan(a.x) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16},
1438 {
__hisnan(a.y) ? HIPRT_ONE_BF16 : HIPRT_ONE_BF16}};
1445__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162
__hle2(
const __hip_bfloat162 a,
1446 const __hip_bfloat162 b) {
1447 return __hip_bfloat162{{
__hle(a.x, b.x) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16},
1448 {
__hle(a.y, b.y) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}};
1455__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162
__hlt2(
const __hip_bfloat162 a,
1456 const __hip_bfloat162 b) {
1457 return __hip_bfloat162{{
__hlt(a.x, b.x) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16},
1458 {
__hlt(a.y, b.y) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}};
1465__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162
__hmax2(
const __hip_bfloat162 a,
1466 const __hip_bfloat162 b) {
1467 return __hip_bfloat162(
__hmax(a.x, b.x),
__hmax(a.y, b.y));
1474__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162
__hmin2(
const __hip_bfloat162 a,
1475 const __hip_bfloat162 b) {
1476 return __hip_bfloat162(
__hmin(a.x, b.x),
__hmin(a.y, b.y));
1483__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162
__hne2(
const __hip_bfloat162 a,
1484 const __hip_bfloat162 b) {
1485 return __hip_bfloat162{{
__hne(a.x, b.x) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16},
1486 {
__hne(a.y, b.y) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}};
1493__BF16_HOST_DEVICE_STATIC__
bool operator==(
const __hip_bfloat16& l,
const __hip_bfloat16& r) {
1501__BF16_HOST_DEVICE_STATIC__
bool operator!=(
const __hip_bfloat16& l,
const __hip_bfloat16& r) {
1509__BF16_HOST_DEVICE_STATIC__
bool operator<(
const __hip_bfloat16& l,
const __hip_bfloat16& r) {
1517__BF16_HOST_DEVICE_STATIC__
bool operator<=(
const __hip_bfloat16& l,
const __hip_bfloat16& r) {
1525__BF16_HOST_DEVICE_STATIC__
bool operator>(
const __hip_bfloat16& l,
const __hip_bfloat16& r) {
1533__BF16_HOST_DEVICE_STATIC__
bool operator>=(
const __hip_bfloat16& l,
const __hip_bfloat16& r) {
1541__BF16_HOST_DEVICE_STATIC__
bool operator==(
const __hip_bfloat162& l,
const __hip_bfloat162& r) {
1543 return ret.x != 0.0f && ret.y != 0.0f;
1550__BF16_HOST_DEVICE_STATIC__
bool operator!=(
const __hip_bfloat162& l,
const __hip_bfloat162& r) {
1558__BF16_HOST_DEVICE_STATIC__
bool operator<(
const __hip_bfloat162& l,
const __hip_bfloat162& r) {
1560 return fl.x < fr.x && fl.x < fr.y;
1567__BF16_HOST_DEVICE_STATIC__
bool operator<=(
const __hip_bfloat162& l,
const __hip_bfloat162& r) {
1569 return fl.x <= fr.x && fl.x <= fr.y;
1576__BF16_HOST_DEVICE_STATIC__
bool operator>(
const __hip_bfloat162& l,
const __hip_bfloat162& r) {
1578 return fl.x > fr.x && fl.x > fr.y;
1585__BF16_HOST_DEVICE_STATIC__
bool operator>=(
const __hip_bfloat162& l,
const __hip_bfloat162& r) {
1587 return fl.x >= fr.x && fl.x >= fr.y;
1594__BF16_DEVICE_STATIC__ __hip_bfloat16
hceil(
const __hip_bfloat16 h) {
1602__BF16_DEVICE_STATIC__ __hip_bfloat16
hcos(
const __hip_bfloat16 h) {
1610__BF16_DEVICE_STATIC__ __hip_bfloat16
hexp(
const __hip_bfloat16 h) {
1618__BF16_DEVICE_STATIC__ __hip_bfloat16
hexp10(
const __hip_bfloat16 h) {
1626__BF16_DEVICE_STATIC__ __hip_bfloat16
hexp2(
const __hip_bfloat16 h) {
1634__BF16_DEVICE_STATIC__ __hip_bfloat16
hfloor(
const __hip_bfloat16 h) {
1642__BF16_DEVICE_STATIC__ __hip_bfloat16
hlog(
const __hip_bfloat16 h) {
1650__BF16_DEVICE_STATIC__ __hip_bfloat16
hlog10(
const __hip_bfloat16 h) {
1658__BF16_DEVICE_STATIC__ __hip_bfloat16
hlog2(
const __hip_bfloat16 h) {
1666__BF16_DEVICE_STATIC__ __hip_bfloat16
hrcp(
const __hip_bfloat16 h) {
1674__BF16_DEVICE_STATIC__ __hip_bfloat16
hrint(
const __hip_bfloat16 h) {
1682__BF16_DEVICE_STATIC__ __hip_bfloat16
hrsqrt(
const __hip_bfloat16 h) {
1690__BF16_DEVICE_STATIC__ __hip_bfloat16
hsin(
const __hip_bfloat16 h) {
1698__BF16_DEVICE_STATIC__ __hip_bfloat16
hsqrt(
const __hip_bfloat16 h) {
1706__BF16_DEVICE_STATIC__ __hip_bfloat16
htrunc(
const __hip_bfloat16 h) {
1714__BF16_DEVICE_STATIC__ __hip_bfloat162
h2ceil(
const __hip_bfloat162 h) {
1715 return __hip_bfloat162(
hceil(h.x),
hceil(h.y));
1722__BF16_DEVICE_STATIC__ __hip_bfloat162
h2cos(
const __hip_bfloat162 h) {
1723 return __hip_bfloat162(
hcos(h.x),
hcos(h.y));
1730__BF16_DEVICE_STATIC__ __hip_bfloat162
h2exp(
const __hip_bfloat162 h) {
1731 return __hip_bfloat162(
hexp(h.x),
hexp(h.y));
1738__BF16_DEVICE_STATIC__ __hip_bfloat162
h2exp10(
const __hip_bfloat162 h) {
1746__BF16_DEVICE_STATIC__ __hip_bfloat162
h2exp2(
const __hip_bfloat162 h) {
1747 return __hip_bfloat162(
hexp2(h.x),
hexp2(h.y));
1754__BF16_DEVICE_STATIC__ __hip_bfloat162
h2floor(
const __hip_bfloat162 h) {
1762__BF16_DEVICE_STATIC__ __hip_bfloat162
h2log(
const __hip_bfloat162 h) {
1763 return __hip_bfloat162(
hlog(h.x),
hlog(h.y));
1770__BF16_DEVICE_STATIC__ __hip_bfloat162
h2log10(
const __hip_bfloat162 h) {
1778__BF16_DEVICE_STATIC__ __hip_bfloat162
h2log2(
const __hip_bfloat162 h) {
1779 return __hip_bfloat162(
hlog2(h.x),
hlog2(h.y));
1786__BF16_DEVICE_STATIC__ __hip_bfloat162
h2rcp(
const __hip_bfloat162 h) {
1787 return __hip_bfloat162(
hrcp(h.x),
hrcp(h.y));
1794__BF16_DEVICE_STATIC__ __hip_bfloat162
h2rint(
const __hip_bfloat162 h) {
1795 return __hip_bfloat162(
hrint(h.x),
hrint(h.y));
1802__BF16_DEVICE_STATIC__ __hip_bfloat162
h2rsqrt(
const __hip_bfloat162 h) {
1810__BF16_DEVICE_STATIC__ __hip_bfloat162
h2sin(
const __hip_bfloat162 h) {
1811 return __hip_bfloat162(
hsin(h.x),
hsin(h.y));
1818__BF16_DEVICE_STATIC__ __hip_bfloat162
h2sqrt(
const __hip_bfloat162 h) {
1819 return __hip_bfloat162(
hsqrt(h.x),
hsqrt(h.y));
1826__BF16_DEVICE_STATIC__ __hip_bfloat162
h2trunc(
const __hip_bfloat162 h) {
1830#if defined(__clang__) && defined(__HIP__)
1835__BF16_DEVICE_STATIC__ __hip_bfloat162 unsafeAtomicAdd(__hip_bfloat162* address,
1836 __hip_bfloat162 value) {
1837#if __has_builtin(__builtin_amdgcn_flat_atomic_fadd_v2bf16)
1838 typedef short __attribute__((ext_vector_type(2))) vec_short2;
1839 static_assert(sizeof(vec_short2) == sizeof(__hip_bfloat162_raw));
1841 __hip_bfloat162_raw bf162_raw;
1843 } u{
static_cast<__hip_bfloat162_raw
>(value)};
1844 u.vs2 = __builtin_amdgcn_flat_atomic_fadd_v2bf16((vec_short2*)address, u.vs2);
1845 return static_cast<__hip_bfloat162
>(u.bf162_raw);
1847 static_assert(
sizeof(
unsigned int) ==
sizeof(__hip_bfloat162_raw));
1849 __hip_bfloat162_raw h2r;
1852 u_hold old_val, new_val;
1854 __hip_atomic_load((
unsigned int*)address, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
1856 new_val.h2r =
__hadd2(old_val.h2r, value);
1857 }
while (!__hip_atomic_compare_exchange_strong((
unsigned int*)address, &old_val.u32, new_val.u32,
1858 __ATOMIC_RELAXED, __ATOMIC_RELAXED,
1859 __HIP_MEMORY_SCOPE_AGENT));
Contains declarations for types and functions in device library. Uses int64_t and uint64_t instead of...
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hsub(const __hip_bfloat16 a, const __hip_bfloat16 b)
Subtracts two bfloat16 values.
Definition amd_hip_bf16.h:805
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 & operator-=(__hip_bfloat16 &l, const __hip_bfloat16 &r)
Operator to subtract-assign two __hip_bfloat16 numbers.
Definition amd_hip_bf16.h:1016
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator+(const __hip_bfloat16 &l)
Operator to unary+ on a __hip_bfloat16 number.
Definition amd_hip_bf16.h:939
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator/(const __hip_bfloat16 &l, const __hip_bfloat16 &r)
Operator to divide two __hip_bfloat16 numbers.
Definition amd_hip_bf16.h:1025
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator-(const __hip_bfloat16 &l)
Operator to negate a __hip_bfloat16 number.
Definition amd_hip_bf16.h:954
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hneg(const __hip_bfloat16 a)
Negate a bfloat16 value.
Definition amd_hip_bf16.h:839
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hadd(const __hip_bfloat16 a, const __hip_bfloat16 b)
Adds two bfloat16 values.
Definition amd_hip_bf16.h:797
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 & operator/=(__hip_bfloat16 &l, const __hip_bfloat16 &r)
Operator to divide-assign two __hip_bfloat16 numbers.
Definition amd_hip_bf16.h:1034
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator*(const __hip_bfloat16 &l, const __hip_bfloat16 &r)
Operator to multiply two __hip_bfloat16 numbers.
Definition amd_hip_bf16.h:921
__BF16_DEVICE_STATIC__ __hip_bfloat16 __hfma(const __hip_bfloat16 a, const __hip_bfloat16 b, const __hip_bfloat16 c)
Performs FMA of given bfloat16 values.
Definition amd_hip_bf16.h:821
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 & operator*=(__hip_bfloat16 &l, const __hip_bfloat16 &r)
Operator to multiply-assign two __hip_bfloat16 numbers.
Definition amd_hip_bf16.h:930
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hmul(const __hip_bfloat16 a, const __hip_bfloat16 b)
Multiplies two bfloat16 values.
Definition amd_hip_bf16.h:831
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator++(__hip_bfloat16 &l, const int)
Operator to post increment a __hip_bfloat16 number.
Definition amd_hip_bf16.h:969
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 & operator+=(__hip_bfloat16 &l, const __hip_bfloat16 &r)
Operator to add-assign two __hip_bfloat16 numbers.
Definition amd_hip_bf16.h:1007
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __habs(const __hip_bfloat16 a)
Returns absolute of a bfloat16.
Definition amd_hip_bf16.h:849
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator--(__hip_bfloat16 &l, const int)
Operator to post decrement a __hip_bfloat16 number.
Definition amd_hip_bf16.h:988
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hdiv(const __hip_bfloat16 a, const __hip_bfloat16 b)
Divides two bfloat16 values.
Definition amd_hip_bf16.h:813
__BF16_HOST_DEVICE_STATIC__ bool __heq(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values.
Definition amd_hip_bf16.h:1171
__BF16_HOST_DEVICE_STATIC__ bool __hgt(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - greater than.
Definition amd_hip_bf16.h:1188
__BF16_HOST_DEVICE_STATIC__ bool __hne(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - not equal.
Definition amd_hip_bf16.h:1220
__BF16_HOST_DEVICE_STATIC__ bool operator==(const __hip_bfloat16 &l, const __hip_bfloat16 &r)
Operator to perform an equal compare on two __hip_bfloat16 numbers.
Definition amd_hip_bf16.h:1493
__BF16_HOST_DEVICE_STATIC__ bool __hle(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - less than equal.
Definition amd_hip_bf16.h:1276
__BF16_HOST_DEVICE_STATIC__ bool operator!=(const __hip_bfloat16 &l, const __hip_bfloat16 &r)
Operator to perform a not equal on two __hip_bfloat16 numbers.
Definition amd_hip_bf16.h:1501
__BF16_HOST_DEVICE_STATIC__ bool operator>(const __hip_bfloat16 &l, const __hip_bfloat16 &r)
Operator to perform a greater than on two __hip_bfloat16 numbers.
Definition amd_hip_bf16.h:1525
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hmin(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - return min.
Definition amd_hip_bf16.h:1248
__BF16_HOST_DEVICE_STATIC__ bool __hge(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - greater than equal.
Definition amd_hip_bf16.h:1204
__BF16_HOST_DEVICE_STATIC__ bool __hleu(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - unordered less than equal.
Definition amd_hip_bf16.h:1284
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hmax(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - return max.
Definition amd_hip_bf16.h:1236
__BF16_HOST_DEVICE_STATIC__ bool operator<=(const __hip_bfloat16 &l, const __hip_bfloat16 &r)
Operator to perform a less than equal on two __hip_bfloat16 numbers.
Definition amd_hip_bf16.h:1517
__BF16_HOST_DEVICE_STATIC__ int __hisinf(const __hip_bfloat16 a)
Checks if number is inf.
Definition amd_hip_bf16.h:1292
__BF16_HOST_DEVICE_STATIC__ bool __hneu(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - unordered not equal.
Definition amd_hip_bf16.h:1228
__BF16_HOST_DEVICE_STATIC__ bool __hgtu(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - unordered greater than.
Definition amd_hip_bf16.h:1196
__BF16_HOST_DEVICE_STATIC__ bool operator<(const __hip_bfloat16 &l, const __hip_bfloat16 &r)
Operator to perform a less than on two __hip_bfloat16 numbers.
Definition amd_hip_bf16.h:1509
__BF16_HOST_DEVICE_STATIC__ bool __hltu(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - unordered less than.
Definition amd_hip_bf16.h:1268
__BF16_HOST_DEVICE_STATIC__ bool __hgeu(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - unordered greater than equal.
Definition amd_hip_bf16.h:1212
__BF16_HOST_DEVICE_STATIC__ bool __hlt(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - less than operator.
Definition amd_hip_bf16.h:1260
__BF16_HOST_DEVICE_STATIC__ bool operator>=(const __hip_bfloat16 &l, const __hip_bfloat16 &r)
Operator to perform a greater than equal on two __hip_bfloat16 numbers.
Definition amd_hip_bf16.h:1533
__BF16_HOST_DEVICE_STATIC__ bool __hisnan(const __hip_bfloat16 a)
Checks if number is nan.
Definition amd_hip_bf16.h:1301
__BF16_HOST_DEVICE_STATIC__ bool __hequ(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - unordered equal.
Definition amd_hip_bf16.h:1179
__BF16_HOST_DEVICE_STATIC__ bool __hbgeu2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a >= b - unordered.
Definition amd_hip_bf16.h:1334
__BF16_HOST_DEVICE_STATIC__ bool __hbgt2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a > b.
Definition amd_hip_bf16.h:1342
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hmax2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Returns max of two elements.
Definition amd_hip_bf16.h:1465
__BF16_HOST_DEVICE_STATIC__ bool __hble2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a <= b.
Definition amd_hip_bf16.h:1358
__BF16_HOST_DEVICE_STATIC__ bool __hblt2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a < b.
Definition amd_hip_bf16.h:1374
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hmin2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Returns min of two elements.
Definition amd_hip_bf16.h:1474
__BF16_HOST_DEVICE_STATIC__ bool __hbleu2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a <= b - unordered.
Definition amd_hip_bf16.h:1366
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hne2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Checks for not equal to.
Definition amd_hip_bf16.h:1483
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hge2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a >= b, returns 1.0 if greater than equal, otherwise 0.0.
Definition amd_hip_bf16.h:1416
__BF16_HOST_DEVICE_STATIC__ bool __hbne2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a != b.
Definition amd_hip_bf16.h:1390
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hlt2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a < b, returns 1.0 if greater than equal, otherwise 0.0.
Definition amd_hip_bf16.h:1455
__BF16_HOST_DEVICE_STATIC__ bool __hbltu2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a < b - unordered.
Definition amd_hip_bf16.h:1382
__BF16_HOST_DEVICE_STATIC__ bool __hbequ2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Checks if two numbers are equal - unordered.
Definition amd_hip_bf16.h:1318
__BF16_HOST_DEVICE_STATIC__ bool __hbeq2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Checks if two numbers are equal.
Definition amd_hip_bf16.h:1310
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hle2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a <= b, returns 1.0 if greater than equal, otherwise 0.0.
Definition amd_hip_bf16.h:1445
__BF16_HOST_DEVICE_STATIC__ bool __hbneu2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a != b.
Definition amd_hip_bf16.h:1398
__BF16_HOST_DEVICE_STATIC__ bool __hbgtu2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a > b - unordered.
Definition amd_hip_bf16.h:1350
__BF16_HOST_DEVICE_STATIC__ bool __hbge2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a >= b.
Definition amd_hip_bf16.h:1326
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __heq2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a != b, returns 1.0 if equal, otherwise 0.0.
Definition amd_hip_bf16.h:1406
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hgt2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a > b, returns 1.0 if greater than equal, otherwise 0.0.
Definition amd_hip_bf16.h:1426
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hisnan2(const __hip_bfloat162 a)
Check for a is NaN, returns 1.0 if NaN, otherwise 0.0.
Definition amd_hip_bf16.h:1436
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __habs2(const __hip_bfloat162 a)
Returns absolute of a bfloat162.
Definition amd_hip_bf16.h:869
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hsub2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Subtracts two bfloat162 values.
Definition amd_hip_bf16.h:912
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hmul2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Multiplies two bfloat162 values.
Definition amd_hip_bf16.h:895
__BF16_DEVICE_STATIC__ __hip_bfloat162 __hfma2(const __hip_bfloat162 a, const __hip_bfloat162 b, const __hip_bfloat162 c)
Performs FMA of given bfloat162 values.
Definition amd_hip_bf16.h:886
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hadd2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Adds two bfloat162 values.
Definition amd_hip_bf16.h:877
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hneg2(const __hip_bfloat162 a)
Converts a bfloat162 into negative.
Definition amd_hip_bf16.h:904
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __h2div(const __hip_bfloat162 a, const __hip_bfloat162 b)
Divides bfloat162 values.
Definition amd_hip_bf16.h:859
__BF16_HOST_DEVICE_STATIC__ float __bfloat162float(__hip_bfloat16 a)
Converts bfloat16 to float.
Definition amd_hip_bf16.h:504
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __float2bfloat16(float f)
Converts float to bfloat16.
Definition amd_hip_bf16.h:513
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __bfloat162bfloat162(const __hip_bfloat16 a)
Moves bfloat16 value to bfloat162.
Definition amd_hip_bf16.h:531
__BF16_HOST_DEVICE_STATIC__ unsigned short int __bfloat16_as_ushort(const __hip_bfloat16 h)
Reinterprets bits in a __hip_bfloat16 as an unsigned signed short integer.
Definition amd_hip_bf16.h:552
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __ushort_as_bfloat16(const unsigned short int a)
Reinterprets unsigned short int into a bfloat16.
Definition amd_hip_bf16.h:674
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __low2bfloat16(const __hip_bfloat162 a)
Returns low 16 bits of __hip_bfloat162.
Definition amd_hip_bf16.h:622
__BF16_HOST_DEVICE_STATIC__ float2 __bfloat1622float2(const __hip_bfloat162 a)
Converts and moves bfloat162 to float2.
Definition amd_hip_bf16.h:522
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __halves2bfloat162(const __hip_bfloat16 a, const __hip_bfloat16 b)
Combine two __hip_bfloat16 to __hip_bfloat162.
Definition amd_hip_bf16.h:582
__BF16_HOST_DEVICE_STATIC__ short int __bfloat16_as_short(const __hip_bfloat16 h)
Reinterprets bits in a __hip_bfloat16 as a signed short integer.
Definition amd_hip_bf16.h:539
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __double2bfloat16(const double a)
Convert double to __hip_bfloat16.
Definition amd_hip_bf16.h:565
__BF16_HOST_DEVICE_STATIC__ float __low2float(const __hip_bfloat162 a)
Converts low 16 bits of __hip_bfloat162 to float and returns the result.
Definition amd_hip_bf16.h:636
__BF16_HOST_DEVICE_STATIC__ float __high2float(const __hip_bfloat162 a)
Converts high 16 bits of __hip_bfloat162 to float and returns the result.
Definition amd_hip_bf16.h:605
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __low2bfloat162(const __hip_bfloat162 a)
Returns low 16 bits of __hip_bfloat162.
Definition amd_hip_bf16.h:628
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __float22bfloat162_rn(const float2 a)
Convert float2 to __hip_bfloat162.
Definition amd_hip_bf16.h:574
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __highs2bfloat162(const __hip_bfloat162 a, const __hip_bfloat162 b)
Extracts high 16 bits from each and combines them.
Definition amd_hip_bf16.h:613
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __lows2bfloat162(const __hip_bfloat162 a, const __hip_bfloat162 b)
Extracts low 16 bits from each and combines them.
Definition amd_hip_bf16.h:652
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __high2bfloat16(const __hip_bfloat162 a)
Returns high 16 bits of __hip_bfloat162.
Definition amd_hip_bf16.h:591
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __high2bfloat162(const __hip_bfloat162 a)
Returns high 16 bits of __hip_bfloat162.
Definition amd_hip_bf16.h:597
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __short_as_bfloat16(const short int a)
Reinterprets short int into a bfloat16.
Definition amd_hip_bf16.h:661
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __lowhigh2highlow(const __hip_bfloat162 a)
Swaps both halves.
Definition amd_hip_bf16.h:644
__BF16_DEVICE_STATIC__ __hip_bfloat16 hexp10(const __hip_bfloat16 h)
Calculate exponential 10 of bfloat16.
Definition amd_hip_bf16.h:1618
__BF16_DEVICE_STATIC__ __hip_bfloat16 hlog(const __hip_bfloat16 h)
Calculate natural log of bfloat16.
Definition amd_hip_bf16.h:1642
__BF16_DEVICE_STATIC__ __hip_bfloat16 hexp2(const __hip_bfloat16 h)
Calculate exponential 2 of bfloat16.
Definition amd_hip_bf16.h:1626
__BF16_DEVICE_STATIC__ __hip_bfloat16 hceil(const __hip_bfloat16 h)
Calculate ceil of bfloat16.
Definition amd_hip_bf16.h:1594
__BF16_DEVICE_STATIC__ __hip_bfloat16 hrcp(const __hip_bfloat16 h)
Calculate reciprocal.
Definition amd_hip_bf16.h:1666
__BF16_DEVICE_STATIC__ __hip_bfloat16 hsqrt(const __hip_bfloat16 h)
Calculate sqrt of bfloat16.
Definition amd_hip_bf16.h:1698
__BF16_DEVICE_STATIC__ __hip_bfloat16 hlog10(const __hip_bfloat16 h)
Calculate log 10 of bfloat16.
Definition amd_hip_bf16.h:1650
__BF16_DEVICE_STATIC__ __hip_bfloat16 hsin(const __hip_bfloat16 h)
Calculate sin of bfloat16.
Definition amd_hip_bf16.h:1690
__BF16_DEVICE_STATIC__ __hip_bfloat16 hfloor(const __hip_bfloat16 h)
Calculate floor of bfloat16.
Definition amd_hip_bf16.h:1634
__BF16_DEVICE_STATIC__ __hip_bfloat16 hrint(const __hip_bfloat16 h)
Round to nearest int.
Definition amd_hip_bf16.h:1674
__BF16_DEVICE_STATIC__ __hip_bfloat16 htrunc(const __hip_bfloat16 h)
Calculate truncate of bfloat16.
Definition amd_hip_bf16.h:1706
__BF16_DEVICE_STATIC__ __hip_bfloat16 hrsqrt(const __hip_bfloat16 h)
Reciprocal square root.
Definition amd_hip_bf16.h:1682
__BF16_DEVICE_STATIC__ __hip_bfloat16 hcos(const __hip_bfloat16 h)
Calculate cosine of bfloat16.
Definition amd_hip_bf16.h:1602
__BF16_DEVICE_STATIC__ __hip_bfloat16 hlog2(const __hip_bfloat16 h)
Calculate log 2 of bfloat16.
Definition amd_hip_bf16.h:1658
__BF16_DEVICE_STATIC__ __hip_bfloat16 hexp(const __hip_bfloat16 h)
Calculate exponential of bfloat16.
Definition amd_hip_bf16.h:1610
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2trunc(const __hip_bfloat162 h)
Calculate truncate of bfloat162.
Definition amd_hip_bf16.h:1826
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2rcp(const __hip_bfloat162 h)
Calculate vector reciprocal.
Definition amd_hip_bf16.h:1786
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2log(const __hip_bfloat162 h)
Calculate natural log of bfloat162.
Definition amd_hip_bf16.h:1762
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2exp(const __hip_bfloat162 h)
Calculate exponential of bfloat162.
Definition amd_hip_bf16.h:1730
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2cos(const __hip_bfloat162 h)
Calculate cosine of bfloat162.
Definition amd_hip_bf16.h:1722
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2sin(const __hip_bfloat162 h)
Calculate sin of bfloat162.
Definition amd_hip_bf16.h:1810
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2log2(const __hip_bfloat162 h)
Calculate log 2 of bfloat162.
Definition amd_hip_bf16.h:1778
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2ceil(const __hip_bfloat162 h)
Calculate ceil of bfloat162.
Definition amd_hip_bf16.h:1714
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2floor(const __hip_bfloat162 h)
Calculate floor of bfloat162.
Definition amd_hip_bf16.h:1754
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2exp10(const __hip_bfloat162 h)
Calculate exponential 10 of bfloat162.
Definition amd_hip_bf16.h:1738
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2exp2(const __hip_bfloat162 h)
Calculate exponential 2 of bfloat162.
Definition amd_hip_bf16.h:1746
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2log10(const __hip_bfloat162 h)
Calculate log 10 of bfloat162.
Definition amd_hip_bf16.h:1770
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2rsqrt(const __hip_bfloat162 h)
Calculate vector reciprocal square root.
Definition amd_hip_bf16.h:1802
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2rint(const __hip_bfloat162 h)
Calculate vector round to nearest int.
Definition amd_hip_bf16.h:1794
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2sqrt(const __hip_bfloat162 h)
Calculate sqrt of bfloat162.
Definition amd_hip_bf16.h:1818
_Float16 __2f16 __attribute__((ext_vector_type(2)))
Definition hip_fp16_math_fwd.h:57
Definition amd_hip_vector_types.h:2035