fss/vdpf_8cuh_source.html

// SPDX-License-Identifier: Apache-2.0

#pragma once

#include <cuda_runtime.h>

#include <cuda/std/array>

#include <cuda/std/span>

#include <cuda/std/tuple>

#include <type_traits>

#include <cstddef>

#include <cassert>

#include <omp.h>

#include <fss/group.cuh>

#include <fss/prg.cuh>

#include <fss/hash.cuh>

#include <fss/util.cuh>


namespace fss {


/**
 * 2-party VDPF scheme.
 *
 * Template parameters:
 *  - in_bits: number of input bits; one correction word is produced/consumed per bit.
 *  - Group: output group type; must satisfy Groupable.
 *  - Prg: pseudorandom generator type; must satisfy Prgable<Prg, 2> (each seed is expanded into 2 blocks).
 *  - XorHash: hash applied to (packed input, final seed) pairs; must satisfy XorHashable.
 *  - Hash: hash applied to 4-block values during proof accumulation; must satisfy Hashable.
 *  - In: input type; an unsigned integer type or __uint128_t that has at least in_bits bits.
 *  - par_depth: forwarded to util::ResolveParDepth in EvalAll; the resolved value bounds the tree levels at
 *    which EvalTree spawns OpenMP tasks. NOTE(review): the meaning of the default -1 is decided by
 *    util::ResolveParDepth, which is not visible here.
 */
template <int in_bits, typename Group, typename Prg, typename XorHash, typename Hash, typename In = uint,

    int par_depth = -1>

  requires((std::is_unsigned_v<In> || std::is_same_v<In, __uint128_t>) && in_bits <= sizeof(In) * 8 &&

      Groupable<Group> && Prgable<Prg, 2> && XorHashable<XorHash> && Hashable<Hash>)


class Vdpf {

public:

  // PRG used to expand a seed into its left and right children (prg.Gen).
  Prg prg;

  // Hash applied to the (packed input, final seed) tuple to build verification values.
  XorHash xor_hash;

  // Hash applied to 4-block values when folding verification values into a proof.
  Hash hash;


  // Correction word for one level of the evaluation tree.
  struct __align__(32) Cw {

    // Seed correction. Its LSB carries the control-bit correction for the left child
    // (Gen packs it with util::SetLsb; Eval/EvalTree read it back with util::GetLsb).
    int4 s;

    // Control-bit correction for the right child.
    bool tr;

  };


  // Cw must be exactly 32 bytes, matching its declared alignment. Per the original note, this is meant to
  // let a correction word be fetched with a single aligned memory access on GPU.

  static_assert(sizeof(Cw) == 32);


  /**
   * Key generation method.
   *
   * Expands both parties' seeds level by level along the path selected by the bits of `a`
   * (most significant of the low in_bits bits first) and emits one correction word per level.
   *
   * @param cws   Output; entries [0, in_bits) are written.
   * @param cs    Output; XOR of the two parties' verification hashes. Written on every call, including retry.
   * @param ocw   Output correction word. Written only when 0 is returned.
   * @param s0s   Initial seeds of party 0 and party 1; their LSBs are cleared before use.
   * @param a     Input point.
   * @param b_buf Payload block; its LSB is cleared before it is converted with Group::From.
   * @return 0 on success, 1 when the two final control bits are equal (the retry condition).
   */
  __host__ __device__ int Gen(
      Cw cws[], cuda::std::array<int4, 4> &cs, int4 &ocw, cuda::std::span<const int4, 2> s0s, In a, int4 b_buf) {
    using HashIn = cuda::std::tuple<int4, const int4>;
    using Block4 = cuda::std::span<const int4, 4>;

    // Party 0 starts with control bit 0, party 1 with control bit 1.
    int4 seed0 = s0s[0];
    seed0 = util::SetLsb(seed0, false);
    bool ctrl0 = false;

    int4 seed1 = s0s[1];
    seed1 = util::SetLsb(seed1, false);
    bool ctrl1 = true;

    b_buf = util::SetLsb(b_buf, false);

    for (int level = 0; level < in_bits; ++level) {
      auto [left0, right0] = prg.Gen(seed0);
      auto [left1, right1] = prg.Gen(seed1);

      // Peel the control bit off every child, leaving the LSB of each child seed cleared.
      const bool left0_t = util::GetLsb(left0);
      const bool right0_t = util::GetLsb(right0);
      const bool left1_t = util::GetLsb(left1);
      const bool right1_t = util::GetLsb(right1);
      left0 = util::SetLsb(left0, false);
      right0 = util::SetLsb(right0, false);
      left1 = util::SetLsb(left1, false);
      right1 = util::SetLsb(right1, false);

      const bool go_right = (a >> (in_bits - 1 - level)) & 1;

      // The seed correction is the XOR of the two children that are NOT on the path of `a`.
      int4 seed_cw = go_right ? util::Xor(left0, left1) : util::Xor(right0, right1);

      // Control-bit corrections: left is t0l ^ t1l ^ a_bit ^ 1, right is t0r ^ t1r ^ a_bit.
      const bool left_t_cw = (left0_t != left1_t) == go_right;
      const bool right_t_cw = (right0_t != right1_t) != go_right;

      // Values belonging to the child that stays on the path.
      const bool kept0_t = go_right ? right0_t : left0_t;
      const bool kept1_t = go_right ? right1_t : left1_t;
      const bool kept_t_cw = go_right ? right_t_cw : left_t_cw;

      // Seeds are corrected using the control bits from BEFORE this level's update.
      seed0 = go_right ? right0 : left0;
      if (ctrl0) seed0 = util::Xor(seed0, seed_cw);
      seed1 = go_right ? right1 : left1;
      if (ctrl1) seed1 = util::Xor(seed1, seed_cw);

      // A party applies the control-bit correction only if its current control bit is set.
      ctrl0 = kept0_t != (ctrl0 && kept_t_cw);
      ctrl1 = kept1_t != (ctrl1 && kept_t_cw);

      // The left control-bit correction travels in the LSB of the seed correction.
      seed_cw = util::SetLsb(seed_cw, left_t_cw);
      cws[level] = {seed_cw, right_t_cw};
    }

    // Verification hash: hash (input, final seed) for both parties and publish the XOR.
    int4 a_buf = util::Pack(a);
    auto hashed0 = xor_hash.Hash(HashIn{a_buf, seed0});
    auto hashed1 = xor_hash.Hash(HashIn{a_buf, seed1});
    cs = util::Xor(Block4(hashed0), Block4(hashed1));

    // Retry condition: the final control bits must differ.
    if (ctrl0 == ctrl1) return 1;

    // Output correction word.
    auto v_cw = Group::From(b_buf) + (-Group::From(seed0)) + Group::From(seed1);
    if (ctrl1) v_cw = -v_cw;
    ocw = v_cw.Into();

    return 0;
  }


  /**
   * Evaluation method.
   *
   * Walks the tree from the root seed along the bits of `x` (most significant of the low in_bits bits
   * first), applying the level's correction word whenever the current control bit is set.
   *
   * @param b   Party index; also the initial control bit. When set, the output share is negated.
   * @param s0  This party's initial seed; its LSB is cleared before use.
   * @param cws Correction words; entries [0, in_bits) are read.
   * @param cs  XOR of the two parties' verification hashes, as produced by Gen.
   * @param ocw Output correction word; its `w` component must have a zero LSB (asserted).
   * @param x   Evaluation point.
   * @param y   Output; receives this party's output share.
   * @return xor_hash of (packed x, final seed), XORed with `cs` when the final control bit is set.
   */
  __host__ __device__ cuda::std::array<int4, 4> Eval(
      bool b, int4 s0, cuda::std::span<const Cw> cws, cuda::std::span<const int4, 4> cs, int4 ocw, In x, int4 &y) {
    using Block4 = cuda::std::span<const int4, 4>;

    int4 seed = s0;
    seed = util::SetLsb(seed, false);
    bool ctrl = b;

    for (int level = 0; level < in_bits; ++level) {
      // Unpack the correction word: the left control-bit correction lives in the LSB of cw.s.
      Cw cw = cws[level];
      int4 seed_cw = cw.s;
      const bool left_t_cw = util::GetLsb(seed_cw);
      seed_cw = util::SetLsb(seed_cw, false);
      const bool right_t_cw = cw.tr;

      auto [left, right] = prg.Gen(seed);

      // Only the child on the path of x is needed, so pick it first and correct just that one.
      const bool go_right = (x >> (in_bits - 1 - level)) & 1;
      int4 child = go_right ? right : left;
      bool child_t = util::GetLsb(child);
      child = util::SetLsb(child, false);

      if (ctrl) {
        child = util::Xor(child, seed_cw);
        child_t = child_t != (go_right ? right_t_cw : left_t_cw);
      }

      seed = child;
      ctrl = child_t;
    }

    // Output share.
    auto share = Group::From(seed);
    assert((ocw.w & 1) == 0);
    if (ctrl) share = share + Group::From(ocw);
    if (b) share = -share;
    y = share.Into();

    // Corrected verification hash.
    int4 x_buf = util::Pack(x);
    auto pi_tilde = xor_hash.Hash(cuda::std::tuple<int4, const int4>{x_buf, seed});
    if (!ctrl) return pi_tilde;
    return util::Xor(Block4(pi_tilde), Block4(cs));
  }


  /**
   * Proof accumulation method.
   *
   * Starts the proof at `cs` and folds in every verification value in order: each one is XORed with the
   * current proof, hashed, and the first two blocks of the hash output are XORed into the first two blocks
   * of the proof. Blocks 2 and 3 of the proof keep the values of cs[2] and cs[3].
   *
   * @param pi_tildes Verification values (e.g. the return values of Eval), processed front to back.
   * @param cs        XOR of the two parties' verification hashes, as produced by Gen.
   * @param pi        Output; receives the accumulated proof.
   */
  void Prove(cuda::std::span<const cuda::std::array<int4, 4>> pi_tildes, cuda::std::span<const int4, 4> cs,
      cuda::std::array<int4, 4> &pi) {
    using Block4 = cuda::std::span<const int4, 4>;

    pi = {cs[0], cs[1], cs[2], cs[3]};

    for (const auto &pi_tilde : pi_tildes) {
      cuda::std::array<int4, 4> mixed = util::Xor(Block4(pi), Block4(pi_tilde));
      auto digest = hash.Hash(Block4(mixed));
      pi[0] = util::Xor(pi[0], digest[0]);
      pi[1] = util::Xor(pi[1], digest[1]);
    }
  }


  /**
   * Verification method.
   *
   * @param pi0 Proof of one party.
   * @param pi1 Proof of the other party.
   * @return true iff all four blocks of the two proofs are component-wise equal.
   */
  __host__ __device__ static bool Verify(cuda::std::span<const int4, 4> pi0, cuda::std::span<const int4, 4> pi1) {
    for (int blk = 0; blk < 4; ++blk) {
      const int4 &lhs = pi0[blk];
      const int4 &rhs = pi1[blk];
      const bool same = lhs.x == rhs.x && lhs.y == rhs.y && lhs.z == rhs.z && lhs.w == rhs.w;
      if (!same) return false;
    }
    return true;
  }


  /**
   * Full domain evaluation method.
   *
   * Evaluates all 2^in_bits inputs and accumulates the proof over them in ascending input order.
   * Host-only: the tree traversal uses OpenMP tasks.
   *
   * Phase 1 walks the whole tree and parks each leaf's (seed, control bit) in `ys` with the control bit
   * packed into the LSB. Phase 2 then overwrites every `ys[j]` with the output share for input j and folds
   * the corresponding verification value into `pi`; it is sequential because each proof step depends on
   * the previous proof value.
   *
   * @param b   Party index; also the initial control bit. When set, output shares are negated.
   * @param s0  This party's initial seed.
   * @param cws Correction words; must hold at least in_bits entries.
   * @param cs  XOR of the two parties' verification hashes, as produced by Gen.
   * @param ocw Output correction word; its `w` component must have a zero LSB (asserted).
   * @param ys  Output; must hold at least 2^in_bits entries. ys[j] receives the share for input j.
   * @param pi  Output; receives the accumulated proof.
   */
  void EvalAll(bool b, int4 s0, cuda::std::span<const Cw> cws, cuda::std::span<const int4, 4> cs, int4 ocw,
      cuda::std::span<int4> ys, cuda::std::array<int4, 4> &pi) {
    // in_bits is a compile-time constant, so reject an unrepresentable domain size at compile time
    // instead of relying on a debug-only runtime assert (the shift below would otherwise be UB).
    static_assert(static_cast<size_t>(in_bits) < sizeof(size_t) * 8,
        "EvalAll requires the domain size 2^in_bits to be representable in size_t");
    const size_t n = static_cast<size_t>(1ULL << in_bits);

    // Fail fast on undersized buffers and malformed inputs before doing any work.
    assert(cws.size() >= static_cast<size_t>(in_bits));
    assert(ys.size() >= n);
    assert((ocw.w & 1) == 0);

    // Root state: seed with the initial control bit packed into its LSB.
    int4 st = s0;
    bool t = b;
    st = util::SetLsb(st, t);

    size_t l = 0;
    size_t r = n;
    int i = 0;

    int par_depth_ = util::ResolveParDepth(par_depth);

    // Phase 1: tree traversal, store (s, t) packed into ys temporarily
#pragma omp parallel
#pragma omp single
    EvalTree(st, cws, ys, l, r, i, par_depth_);

    // Phase 2: sequential output computation and proof accumulation
    pi = {cs[0], cs[1], cs[2], cs[3]};
    auto ocw_group = Group::From(ocw);
    for (size_t j = 0; j < n; ++j) {
      // Unpack the leaf state left behind by phase 1.
      int4 sj = ys[j];
      bool tj = util::GetLsb(sj);
      sj = util::SetLsb(sj, false);

      // Output share
      auto g = Group::From(sj);
      if (tj) g = g + ocw_group;
      if (b) g = -g;
      ys[j] = g.Into();

      // Proof accumulation
      int4 x_buf = util::Pack(static_cast<In>(j));

      auto pi_tilde = xor_hash.Hash(cuda::std::tuple<int4, const int4>{x_buf, sj});
      if (tj) {
        pi_tilde = util::Xor(cuda::std::span<const int4, 4>(pi_tilde), cuda::std::span<const int4, 4>(cs));
      }

      cuda::std::array<int4, 4> h_input =
          util::Xor(cuda::std::span<const int4, 4>(pi), cuda::std::span<const int4, 4>(pi_tilde));
      auto h_out = hash.Hash(cuda::std::span<const int4, 4>(h_input));
      // Only the first two proof blocks are updated; blocks 2 and 3 keep cs[2] and cs[3].
      pi[0] = util::Xor(pi[0], h_out[0]);
      pi[1] = util::Xor(pi[1], h_out[1]);
    }
  }


private:

  /**
   * Recursive tree traversal used by EvalAll.
   *
   * Expands the node state `st` (seed with the control bit packed into its LSB) one level, applies the
   * level's correction word when the control bit is set, and recurses into both children. At depth
   * in_bits the packed leaf state is stored in ys[l].
   *
   * @param st         Node state: seed with the control bit in its LSB.
   * @param cws        Correction words; entry `i` is read at depth `i`.
   * @param ys         Output buffer for packed leaf states.
   * @param l          First leaf index covered by this subtree (inclusive).
   * @param r          Last leaf index covered by this subtree (exclusive).
   * @param i          Current depth; the root is 0.
   * @param par_depth_ Both children are spawned as OpenMP tasks while i < par_depth_; deeper levels recurse
   *                   sequentially.
   */
  void EvalTree(
      int4 st, cuda::std::span<const Cw> cws, cuda::std::span<int4> ys, size_t l, size_t r, int i, int par_depth_) {
    // Leaf: the subtree covers exactly one index.
    if (i == in_bits) {
      assert(l + 1 == r);
      ys[l] = st;
      return;
    }

    // Unpack this node's state.
    const bool ctrl = util::GetLsb(st);
    int4 seed = st;
    seed = util::SetLsb(seed, false);

    // Unpack the correction word: the left control-bit correction lives in the LSB of cw.s.
    Cw cw = cws[i];
    int4 seed_cw = cw.s;
    const bool left_t_cw = util::GetLsb(seed_cw);
    seed_cw = util::SetLsb(seed_cw, false);
    const bool right_t_cw = cw.tr;

    auto [left, right] = prg.Gen(seed);

    bool left_t = util::GetLsb(left);
    bool right_t = util::GetLsb(right);
    left = util::SetLsb(left, false);
    right = util::SetLsb(right, false);

    if (ctrl) {
      left = util::Xor(left, seed_cw);
      right = util::Xor(right, seed_cw);
      left_t = left_t != left_t_cw;
      right_t = right_t != right_t_cw;
    }

    // Re-pack each child into a plain int4 local; these are the values handed to the recursive calls.
    int4 left_st = left;
    left_st = util::SetLsb(left_st, left_t);
    int4 right_st = right;
    right_st = util::SetLsb(right_st, right_t);

    size_t mid = (l + r) >> 1;

    // Below the parallel depth: plain sequential recursion.
    if (i >= par_depth_) {
      EvalTree(left_st, cws, ys, l, mid, i + 1, par_depth_);
      EvalTree(right_st, cws, ys, mid, r, i + 1, par_depth_);
      return;
    }

    // Within the parallel depth: one task per child, then wait for both.
#pragma omp task
    EvalTree(left_st, cws, ys, l, mid, i + 1, par_depth_);
#pragma omp task
    EvalTree(right_st, cws, ys, mid, r, i + 1, par_depth_);
#pragma omp taskwait
  }

};


}  // namespace fss

fss::Vdpf
2-party VDPF scheme.
Definition vdpf.cuh:67

fss::Vdpf::Gen
int Gen(Cw cws[], cuda::std::array< int4, 4 > &cs, int4 &ocw, cuda::std::span< const int4, 2 > s0s, In a, int4 b_buf)
Key generation method.
Definition vdpf.cuh:101

fss::Vdpf::Prove
void Prove(cuda::std::span< const cuda::std::array< int4, 4 > > pi_tildes, cuda::std::span< const int4, 4 > cs, cuda::std::array< int4, 4 > &pi)
Proof accumulation method.
Definition vdpf.cuh:253

fss::Vdpf::Eval
cuda::std::array< int4, 4 > Eval(bool b, int4 s0, cuda::std::span< const Cw > cws, cuda::std::span< const int4, 4 > cs, int4 ocw, In x, int4 &y)
Evaluation method.
Definition vdpf.cuh:189

fss::Vdpf::Verify
static bool Verify(cuda::std::span< const int4, 4 > pi0, cuda::std::span< const int4, 4 > pi1)
Verification method.
Definition vdpf.cuh:270

fss::Vdpf::EvalAll
void EvalAll(bool b, int4 s0, cuda::std::span< const Cw > cws, cuda::std::span< const int4, 4 > cs, int4 ocw, cuda::std::span< int4 > ys, cuda::std::array< int4, 4 > &pi)
Full domain evaluation method.
Definition vdpf.cuh:293

Groupable
Group interface.
Definition group.cuh:40

Hashable
Collision-resistant hash interface.
Definition hash.cuh:19

Prgable
Pseudorandom generator (PRG) interface.
Definition prg.cuh:21

XorHashable
Collision-resistant and XOR-collision-resistant hash interface.
Definition hash.cuh:27

group.cuh

hash.cuh

prg.cuh

fss::Vdpf::Cw
Correction word.
Definition vdpf.cuh:81

util.cuh