29 __host__ __device__
static int RotateRight(
int a) {
30 auto u =
static_cast<unsigned>(a);
31 return static_cast<int>((u >> n) | (u << (32 - n)));
34 __host__ __device__
static void G(
int &a,
int &b,
int &c,
int &d,
int x,
int y) {
36 d = RotateRight<16>(d ^ a);
38 b = RotateRight<12>(b ^ c);
40 d = RotateRight<8>(d ^ a);
42 b = RotateRight<7>(b ^ c);
45 __host__ __device__
static int GetWord(
const int4 m[4],
int i) {
46 return reinterpret_cast<const int *
>(m)[i];
49 __host__ __device__
static void SetWord(int4 m[4],
int i,
int val) {
50 reinterpret_cast<int *
>(m)[i] = val;
53 __host__ __device__
static void Permute(int4 m[4]) {
54 constexpr int perm[16] = {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8};
56 for (
int i = 0; i < 16; ++i) {
57 tmp[i] = GetWord(m, perm[i]);
59 for (
int i = 0; i < 16; ++i) {
60 SetWord(m, i, tmp[i]);
64 __host__ __device__
static void Round(int4 v[4],
const int4 m[4]) {
66 G(v[0].x, v[1].x, v[2].x, v[3].x, GetWord(m, 0), GetWord(m, 1));
67 G(v[0].y, v[1].y, v[2].y, v[3].y, GetWord(m, 2), GetWord(m, 3));
68 G(v[0].z, v[1].z, v[2].z, v[3].z, GetWord(m, 4), GetWord(m, 5));
69 G(v[0].w, v[1].w, v[2].w, v[3].w, GetWord(m, 6), GetWord(m, 7));
71 G(v[0].x, v[1].y, v[2].z, v[3].w, GetWord(m, 8), GetWord(m, 9));
72 G(v[0].y, v[1].z, v[2].w, v[3].x, GetWord(m, 10), GetWord(m, 11));
73 G(v[0].z, v[1].w, v[2].x, v[3].y, GetWord(m, 12), GetWord(m, 13));
74 G(v[0].w, v[1].x, v[2].y, v[3].z, GetWord(m, 14), GetWord(m, 15));
78 constexpr static int4 kIv0 = {
79 static_cast<int>(0x6A09E667),
80 static_cast<int>(0xBB67AE85),
81 static_cast<int>(0x3C6EF372),
82 static_cast<int>(0xA54FF53A),
85 constexpr static int kChunkStart = 1;
86 constexpr static int kChunkEnd = 2;
87 constexpr static int kRoot = 8;
88 constexpr static int kKeyedHash = 16;
100 __host__ __device__
static cuda::std::array<int4, 4> Compress(
101 const int4 h[2],
const int4 msg[4],
unsigned long long counter,
int block_len,
int flags) {
106 v[3] = {
static_cast<int>(counter & 0xFFFFFFFF),
107 static_cast<int>((counter >> 32) & 0xFFFFFFFF), block_len, flags};
109 int4 m[4] = {msg[0], msg[1], msg[2], msg[3]};
112 for (
int i = 0; i < 7; ++i) {
120 v[0] = util::Xor(v[0], v[2]);
121 v[1] = util::Xor(v[1], v[3]);
122 v[2] = util::Xor(v[2], h[0]);
123 v[3] = util::Xor(v[3], h[1]);
125 return {v[0], v[1], v[2], v[3]};
135 __host__ __device__
explicit Blake3(cuda::std::span<const int4, 2> iv) : iv_{iv[0], iv[1]} {}
146 __host__ __device__ cuda::std::array<int4, 2>
Hash(cuda::std::span<const int4, 4> msg) {
147 constexpr int flags = kChunkStart | kChunkEnd | kRoot | kKeyedHash;
148 auto out = Compress(iv_, msg.data(), 0, 64, flags);
149 return {out[0], out[1]};
161 __host__ __device__ cuda::std::array<int4, 4>
Hash(cuda::std::tuple<int4, const int4> msg) {
162 constexpr int flags = kChunkStart | kChunkEnd | kRoot | kKeyedHash;
164 int4 padded[4] = {util::SetLsb(a,
false), b, {0, 0, 0, 0}, {0, 0, 0, 0}};
166 auto out0 = Compress(iv_, padded, 0, 32, flags);
168 padded[0] = util::SetLsb(a,
true);
169 auto out1 = Compress(iv_, padded, 0, 32, flags);
171 return {out0[0], out0[1], out1[0], out1[1]};