#ifndef __HOST_PIPES_HPP__
#define __HOST_PIPES_HPP__

#include <atomic>
#include <cstddef>
#include <cstdint>
#include <exception>
#include <iostream>
#include <limits>
#include <type_traits>

#include <sycl/sycl.hpp>
#include <sycl/ext/intel/fpga_extensions.hpp>
#include <sycl/ext/intel/prototype/pipes.hpp>

////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// Local Utilities
// Compile-time integer helpers (powers of two, log2) and ring buffer index
// utilities used by the host pipe implementation below
namespace detail {
// returns 2 raised to the power 'n' (i.e. the value 1 shifted left by 'n' bits)
template <typename T>
constexpr T Pow2(T n) {
  const T one = 1;
  return one << n;
}

// true if and only if 'n' is a power of 2 (0 is not a power of 2)
template <typename T>
constexpr bool IsPow2(T n) {
  if (n == 0) {
    return false;
  }
  // a power of 2 has a single bit set, so clearing the lowest set bit
  // must leave nothing behind
  return (n & (n - 1)) == 0;
}

// floor(log2(n)) for integral 'n'; any input below 2 yields 0
template <typename T>
constexpr T Log2(T n) {
  static_assert(std::is_integral_v<T>);
  // count how many times 'n' can be halved before it drops below 2
  T halvings = 0;
  for (; n >= 2; n /= 2) {
    ++halvings;
  }
  return halvings;
}

// rounds the unsigned integer 'n' up to the closest power of 2 that is >= 'n'
// (powers of 2 are returned unchanged; an input of 0 yields 2)
template <typename T>
constexpr T RoundUpPow2(T n) {
  static_assert(std::is_integral_v<T>);
  static_assert(std::is_unsigned_v<T>);
  if (n == 0) return 2;
  if (IsPow2(n)) return n;
  // not a power of 2: the answer is one bit position above the highest set bit
  return T(1) << (Log2(n) + 1);
}

// Convenient utility functions for the ring buffer.
//
// The read/write indices are NOT stored modulo the buffer size. They count
// modulo kIndexWrapMax (half the range of index_type) and are only reduced to
// a real buffer position with buffer_mask() when the buffer is accessed. This
// makes "empty" (write_idx == read_idx) and "full" (count == buffer_size)
// distinguishable without a separate flag.
namespace ring_buffer_utils {
// the datatype used to index into the ring buffer; the choice here affects
// both the possible size of the ring buffer and the performance of the
// device code (less bits is better for Fmax)
//using index_type = unsigned long long;
using index_type = unsigned int;

// This is used to mask the indices to wrap around before integer overflow:
// every bit set except the most significant one.
// For a 32-bit index_type this is 0x7FFFFFFF.
inline constexpr index_type kIndexWrapMask =
    (std::numeric_limits<index_type>::max() >> 1);

// The modulus of the indices (always a power of 2).
// For a 32-bit index_type this is 0x80000000.
inline constexpr index_type kIndexWrapMax = (kIndexWrapMask + 1);

// Limit the size of the buffer to ensure that the write_idx can never catch
// the read_idx; the buffer will fill up first.
inline constexpr index_type kBufferSizeMax = kIndexWrapMax / 2;

// Mask the index by the (true) buffer bounds. The caller guarantees that
// buffer_size is a power of 2, so this is a cheaper version of
// idx % buffer_size.
inline index_type buffer_mask(index_type idx, index_type buffer_size) {
  return (idx & (buffer_size-1));
}

// Get the number of elements currently in the buffer.
// Both indices must be in the range [0, kIndexWrapMax).
inline index_type buffer_count(index_type write_idx, index_type read_idx) {
  // When write_idx has wrapped around past kIndexWrapMax but read_idx has not
  // yet, write_idx is numerically smaller than read_idx. In that case the
  // count is the distance from read_idx up to the wrap point plus the
  // distance from 0 up to write_idx.
  return (write_idx >= read_idx) ? (write_idx - read_idx) :
                                   (kIndexWrapMax - read_idx) + write_idx;
}

// The buffer is empty when the producer and consumer indices coincide.
inline bool buffer_empty(index_type write_idx, index_type read_idx) {
  return write_idx == read_idx;
}

// The buffer is full when it holds exactly buffer_size elements.
inline bool buffer_full(index_type write_idx, index_type read_idx, index_type buffer_size) {
  return buffer_count(write_idx, read_idx) == buffer_size;
}

// Move to the next index. Since kIndexWrapMax is guaranteed to be a
// power of two, adding to the index and masking with kIndexWrapMax-1 (i.e.
// kIndexWrapMask) is equivalent to % kIndexWrapMax.
inline index_type buffer_next_idx(index_type idx) {
  return (idx + 1) & kIndexWrapMask;
}

// Same as above but advancing by 'delta' positions instead of one.
// We use separate functions to improve performance on the FPGA.
inline index_type buffer_next_idx(index_type idx, index_type delta) {
  return (idx + delta) & kIndexWrapMask;
}
}  // namespace ring_buffer_utils
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// Host Pipe Implementation
// bring the ring buffer index type into this namespace so the host pipe
// implementation can refer to it unqualified
using index_type = ring_buffer_utils::index_type;

// Tag types, declared but never defined. host_pipe_impl uses
// HostPipePipeId<Id> as the ID of its underlying SYCL pipe and
// HostPipeKernelId<Id> as the name of its ring buffer kernel.
// Declare these out of the host_pipe to reduce name mangling
// TODO: guard these with namespaces?
template<typename Id>
class HostPipePipeId;
template<typename Id>
class HostPipeKernelId;


template <typename Id_, typename T_, size_t min_capacity_, int32_t ready_latency_, int32_t bits_per_symbol_, 
          bool uses_valid_, bool first_symbol_in_high_order_bits_,
          sycl::ext::intel::prototype::internal::protocol_name protocol_>
struct host_pipe_impl {
  // no objects exist (the whole interface is static member functions), so
  // copying is deleted
  host_pipe_impl(const host_pipe_impl&) = delete;
  host_pipe_impl& operator=(host_pipe_impl const&) = delete;

  // for convenience, expose the element type and the requested minimum
  // capacity to the user
  using value_type = T_;
  static constexpr size_t min_capacity = min_capacity_;

  // the SYCL pipe that the device-side read()/write() overloads and the ring
  // buffer kernels operate on; it is keyed on HostPipePipeId<Id_> and forwards
  // all remaining template parameters unchanged
  using Pipe = sycl::ext::intel::prototype::internal::pipe<HostPipePipeId<Id_>, T_, min_capacity_, ready_latency_, 
               bits_per_symbol_, uses_valid_, first_symbol_in_high_order_bits_,
               protocol_>;

  // HOST: non-blocking read
  // Tries to take one element out of the ring buffer. On success
  // 'success_code' is set to true and the element is returned; if the buffer
  // is empty 'success_code' is set to false and a value-initialized T_ is
  // returned.
  static T_ read(sycl::queue& q, bool &success_code) {
#if __SYCL_DEVICE_ONLY__ == 1
    //static_assert(false, "read(queue&, bool&) cannot be called from device code");
#endif

    // lazy setup; the host is the reader, so this is a device-to-host pipe
    // (host_writeable = false)
    init<false>(q);

    // snapshot both indices with acquire ordering; the indices live in
    // memory that is shared with the ring buffer kernel
    const index_type producer_idx = __atomic_load_n(write_idx_, __ATOMIC_ACQUIRE);
    const index_type consumer_idx = __atomic_load_n(read_idx_, __ATOMIC_ACQUIRE);

    // workaround: nudges scheduler to make progress in runtime
    d2h_kernel_event.get_info<sycl::info::event::command_execution_status>();

    // guard: nothing available, the request fails
    if (ring_buffer_utils::buffer_empty(producer_idx, consumer_idx)) {
      success_code = false;
      return T_();
    }

    // translate the wrapping index into a real position in the buffer and
    // copy the element out
    const index_type slot = ring_buffer_utils::buffer_mask(consumer_idx, buffer_size_);
    T_ value = ring_buffer_[slot];

    // publish the advanced read index only after the element has been copied;
    // SEQ_CST is deliberately the most conservative ordering here
    __atomic_store_n(read_idx_, ring_buffer_utils::buffer_next_idx(consumer_idx),
                     __ATOMIC_SEQ_CST);

    // read succeeded, hand back the data
    success_code = true;
    return value;
  }

  // HOST: blocking read
  // Spins on the non-blocking host read until an element is available and
  // returns it.
  static T_ read(sycl::queue& q) {
#if __SYCL_DEVICE_ONLY__ == 1
    //static_assert(false, "read(queue&) cannot be called from device code");
#endif
    bool got_data = false;
    T_ value;
    // 'got_data' starts out false, so at least one attempt is always made
    while (!got_data) {
      value = read(q, got_data);
    }
    return value;
  }

  // DEVICE: non-blocking read
  // Forwards to the non-blocking read of the underlying SYCL pipe, which
  // reports through 'success_code' whether a value was read.
  static T_ read(bool &success_code) {
#if __SYCL_DEVICE_ONLY__ == 0
    //static_assert(false, "read(bool&) cannot be called from host code");
#endif
    T_ value = Pipe::read(success_code);
    return value;
  }

  // DEVICE: blocking read
  // Forwards to the blocking read of the underlying SYCL pipe.
  static T_ read() {
#if __SYCL_DEVICE_ONLY__ == 0
    //static_assert(false, "read() cannot be called from host code");
#endif
    T_ value = Pipe::read();
    return value;
  }

  // HOST: non-blocking write
  // Tries to append 'data' to the ring buffer. 'success_code' is set to true
  // if the element was stored and to false if the buffer was full (in which
  // case nothing is written).
  static void write(sycl::queue& q, const T_ &data, bool &success_code) {
#if __SYCL_DEVICE_ONLY__ == 1
    //static_assert(false, "write(queue&, const T_&, bool&) cannot be called from device code");
#endif

    // lazy setup; the host is the writer, so this is a host-to-device pipe
    // (host_writeable = true)
    init<true>(q);

    // snapshot both indices with acquire ordering; the indices live in
    // memory that is shared with the ring buffer kernel
    const index_type producer_idx = __atomic_load_n(write_idx_, __ATOMIC_ACQUIRE);
    const index_type consumer_idx = __atomic_load_n(read_idx_, __ATOMIC_ACQUIRE);

    // workaround: nudges scheduler to make progress in runtime
    h2d_kernel_event.get_info<sycl::info::event::command_execution_status>();

    // guard: no free slot, the request fails
    if (ring_buffer_utils::buffer_full(producer_idx, consumer_idx, buffer_size_)) {
      success_code = false;
      return;
    }

    // translate the wrapping index into a real position in the buffer and
    // store the element there
    const index_type slot = ring_buffer_utils::buffer_mask(producer_idx, buffer_size_);
    ring_buffer_[slot] = data;

    // publish the advanced write index with release ordering so that the
    // element store above is ordered before the index update.
    // With C++20 or proper SYCL support this would be an atomic_ref; the
    // __atomic builtins are the stand-in.
    __atomic_store_n(write_idx_, ring_buffer_utils::buffer_next_idx(producer_idx),
                     __ATOMIC_RELEASE);

    // write succeeded
    success_code = true;
  }

  // HOST: blocking write
  // Spins on the non-blocking host write until 'data' has been accepted.
  static void write(sycl::queue& q, const T_ &data) {
#if __SYCL_DEVICE_ONLY__ == 1
    //static_assert(false, "write(queue&, const T_&) cannot be called from device code");
#endif
    // 'accepted' starts out false, so at least one attempt is always made
    bool accepted = false;
    while (!accepted) {
      write(q, data, accepted);
    }
  }

  // DEVICE: non-blocking write
  // Forwards to the non-blocking write of the underlying SYCL pipe, which
  // reports through 'success_code' whether 'data' was written.
  static void write(const T_ &data, bool &success_code) {
#ifndef __SYCL_DEVICE_ONLY__
    //static_assert(false, "write(const T_&, bool&) cannot be called from host code");
#endif
    Pipe::write(data, success_code);
  }

  // DEVICE: blocking write
  // Forwards to the blocking write of the underlying SYCL pipe.
  static void write(const T_ &data) {
#ifndef __SYCL_DEVICE_ONLY__
    //static_assert(false, "write(const T_&) cannot be called from host code");
#endif
    Pipe::write(data);
  }


private:
  // private constructor to stop object creation; it is only declared here
  // (no definition appears in this file) because every operation on the host
  // pipe is a static member function
  host_pipe_impl();

  //
  // Function to initialize the host pipe. Used for lazy initialization and
  // will only actually initialize once (on the first call).
  // The template argument is the direction of the host pipe.
  //    host_writeable == true ? host to device pipe
  //    host_writeable == false ? device to host pipe
  //
  // On the first call this allocates the ring buffer and the two indices as
  // USM host allocations on 'q', zeroes the indices and launches the ring
  // buffer kernel for the requested direction. If any allocation fails an
  // error is printed to std::cerr and std::terminate() is called.
  //
  // NOTE: the "initialized" flag is a plain function-local static bool with
  // no synchronization, and each value of 'host_writeable' is a separate
  // instantiation with its own flag.
  //
  template <bool host_writeable>
  static void init(sycl::queue& q) {
    // only initialize once
    static bool initialized = false;
    if (initialized) {
      return;
    }

    // The ring buffer helpers require a power-of-2 size that is no larger
    // than kBufferSizeMax. Check this at compile time rather than silently
    // truncating the (size_t) capacity into the narrower index_type.
    constexpr auto kRoundedCapacity = RoundUpPow2(min_capacity_);
    static_assert(kRoundedCapacity <= ring_buffer_utils::kBufferSizeMax,
                  "min_capacity is too large for the ring buffer index type");
    buffer_size_ = static_cast<index_type>(kRoundedCapacity);

    // USM host allocations
    // TODO: these allocations are being leaked.
    if ((ring_buffer_ = sycl::malloc_host<T_>(buffer_size_, q)) == nullptr) {
      std::cerr << "ERROR: could not allocate space for 'ring_buffer_'\n";
      std::terminate();
    }
    if ((write_idx_ = sycl::malloc_host<index_type>(1, q)) == nullptr) {
      std::cerr << "ERROR: could not allocate space for 'write_idx_'\n";
      std::terminate();
    }
    if ((read_idx_ = sycl::malloc_host<index_type>(1, q)) == nullptr) {
      std::cerr << "ERROR: could not allocate space for 'read_idx_'\n";
      std::terminate();
    }

    // start with an empty buffer: both indices at 0
    __atomic_store_n(write_idx_, 0, __ATOMIC_RELEASE);
    __atomic_store_n(read_idx_, 0, __ATOMIC_RELEASE);

    // launch the ring buffer kernel
    // notice the constexpr if, this is useful so that we only create 1
    // kernel per host pipe. We could launch both kernels for every host pipe
    // but that would be wasteful
    if constexpr(host_writeable) {
      h2d_kernel_event = launch_ring_buffer_kernel_h2d(q, ring_buffer_, buffer_size_, write_idx_, read_idx_);
    } else {
      d2h_kernel_event = launch_ring_buffer_kernel_d2h(q, ring_buffer_, buffer_size_, write_idx_, read_idx_);
    }

    // don't initialize again
    initialized = true;
  }

  //
  // HOST-TO-DEVICE: This function launches the kernel that reads data
  // from the ring buffer and writes it to the SYCL pipe.
  //
  // Parameters:
  //   q             - queue the kernel is submitted to
  //   buffer_ptr    - ring buffer storage
  //   buffer_size   - number of elements in the ring buffer (used as a
  //                   power-of-2 mask by ring_buffer_utils::buffer_mask)
  //   write_idx_ptr - producer index, only read by this kernel
  //   read_idx_ptr  - consumer index, only written by this kernel
  // Returns the event of the submitted single_task. The kernel body is an
  // infinite loop with no exit condition.
  //
  static sycl::event launch_ring_buffer_kernel_h2d(sycl::queue& q,
                                            T_ *buffer_ptr,
                                            index_type buffer_size,
                                            volatile index_type *write_idx_ptr,
                                            volatile index_type *read_idx_ptr) {
    // This kernel implements the host->device ring buffer on the device side
    // i.e. host is producing data to write_idx, device is consuming data from
    // read_idx (and then writing that data into the kernel system via 'Pipe')
    return q.single_task<HostPipeKernelId<Id_>>([=]() [[intel::kernel_args_restrict]] {
      // annotated host pointers
      sycl::ext::intel::host_ptr<T_> buffer(buffer_ptr);
      sycl::ext::intel::host_ptr<volatile index_type> write_idx(write_idx_ptr);
      sycl::ext::intel::host_ptr<volatile index_type> read_idx(read_idx_ptr);

      // the read_idx is updated by the device, store it locally and only
      // send updates to the host (through 'read_idx' host pointer) when
      // we 'want' to (see below)
      index_type local_read_idx = 0;

      // TODO: sideband signal (kernel + pipe) to terminate this loop?
      [[intel::disable_loop_pipelining]]
      while (true) {
        // take a snapshot of the current state of the ring buffer's
        // write index (updated by the host, in this case).
        // TODO: When we have proper support, write_idx should be an atomic_ref
        // and this load should be an atomic load with acquire memory order.
        index_type curr_write_idx = *write_idx;

        // fence between loading the write index and loading the element
        atomic_fence(sycl::memory_order::seq_cst, sycl::memory_scope::device);

        // consume at most one element per loop iteration, and only if the
        // buffer (in its current snapshotted state) is not empty
        // TODO: use a while-loop here to minimize writes to read_idx but need
        // a "breakout" counter to make sure we don't get stuck.
        if (!ring_buffer_utils::buffer_empty(curr_write_idx, local_read_idx)) {
          // TODO: use LSU attributes to ensure we don't get a caching LSU
          // when LSU attributes support host_ptrs
          auto data = *(buffer + ring_buffer_utils::buffer_mask(local_read_idx, buffer_size));
          
          // write to the SYCL pipe to produce the value into the kernel system
          // (this is the blocking pipe write)
          Pipe::write(data);

          // move to next read index
          local_read_idx = ring_buffer_utils::buffer_next_idx(local_read_idx);
        }

        // make sure we are done doing the reads before updating read_idx
        atomic_fence(sycl::memory_order::seq_cst, sycl::memory_scope::device);

        // update the read_idx pointer; this store happens on every iteration,
        // including iterations where nothing was consumed
        // TODO: when we have proper support, we should drop the fence and use
        // atomic_ref with system scope and release memory order(?)
        *read_idx = local_read_idx;
      }
    });
  }

  //
  // DEVICE-TO-HOST: This function launches the kernel that reads data
  // from the SYCL pipe and writes it to the ring buffer.
  //
  // Parameters:
  //   q             - queue the kernel is submitted to
  //   buffer_ptr    - ring buffer storage
  //   buffer_size   - number of elements in the ring buffer (used as a
  //                   power-of-2 mask by ring_buffer_utils::buffer_mask)
  //   write_idx_ptr - producer index, only written by this kernel
  //   read_idx_ptr  - consumer index, only read by this kernel
  // Returns the event of the submitted single_task. The kernel body is an
  // infinite loop with no exit condition.
  //
  static sycl::event launch_ring_buffer_kernel_d2h(sycl::queue& q,
                                            T_ *buffer_ptr,
                                            index_type buffer_size,
                                            volatile index_type *write_idx_ptr,
                                            volatile index_type *read_idx_ptr) {
    // This kernel implements the device->host ring buffer on the device side
    // i.e. The kernel system produces data to 'Pipe' and this kernel
    // writes to the ring buffer at 'write_idx' while the host consumes data
    // from the 'read_idx'
    return q.single_task<HostPipeKernelId<Id_>>([=]() [[intel::kernel_args_restrict]] {
      // annotated host pointers
      sycl::ext::intel::host_ptr<T_> buffer(buffer_ptr);
      sycl::ext::intel::host_ptr<volatile index_type> write_idx(write_idx_ptr);
      sycl::ext::intel::host_ptr<volatile index_type> read_idx(read_idx_ptr);

      // this kernel is writing to the ring buffer, so track the write_idx
      // locally and we will periodically update the write_idx in host memory
      index_type local_write_idx = 0;

      // TODO: sideband signal (kernel + pipe) to terminate this loop?
      [[intel::disable_loop_pipelining]]
      while (true) {
        // take a snapshot of the current state of the ring buffer's
        // read index (updated by the host, in this case).
        // TODO: When we have proper support, read_idx should be an atomic_ref
        // and this load should be an atomic load with acquire memory order.
        index_type curr_read_idx = *read_idx;

        // fence between loading the read index and touching the buffer
        atomic_fence(sycl::memory_order::seq_cst, sycl::memory_scope::device);

        // read data from the input pipe and write to the ring buffer if there
        // is room; at most one element is moved per loop iteration.
        // TODO: use a while-loop here to minimize writes to write_idx but need
        // a "breakout" counter to make sure we don't get stuck.
        if (!ring_buffer_utils::buffer_full(local_write_idx, curr_read_idx, buffer_size)) {
          // non-blocking read from the input pipe, so the loop keeps spinning
          // (and keeps refreshing curr_read_idx) when the pipe has no data
          bool pipe_valid;
          auto data = Pipe::read(pipe_valid);

          if (pipe_valid) {
            // write to the ring buffer, masking the write_idx gets the 'true'
            // index into the buffer.
            *(buffer + ring_buffer_utils::buffer_mask(local_write_idx, buffer_size)) = data;

            // increment the write_idx locally
            local_write_idx = ring_buffer_utils::buffer_next_idx(local_write_idx);;
          }
        }

        // make sure the previous writes (to buffer) have committed before
        // writing to write_idx
        atomic_fence(sycl::memory_order::seq_cst, sycl::memory_scope::device);

        // update the write_idx; this store happens on every iteration,
        // including iterations where nothing was produced
        // TODO: when we have proper support, we should drop the fence and use
        // atomic_ref with system scope and release memory order
        // TODO: check if it has changed to limit expensive writes
        *write_idx = local_write_idx;
      }
    });
  }

  // ring buffer variables, one set per host_pipe_impl specialization.
  // All are filled in by init(): the three pointers are USM host allocations
  // and buffer_size_ is the power-of-2 number of elements in ring_buffer_.
  static inline T_ *ring_buffer_{nullptr};
  // producer index (written by whichever side produces data)
  static inline index_type *write_idx_{nullptr};
  // consumer index (written by whichever side consumes data)
  static inline index_type *read_idx_{nullptr};
  static inline index_type buffer_size_{};

  // workaround: used to make progress in runtime
  // init() stores the event of the launched ring buffer kernel in the one
  // matching the pipe direction; the host read/write functions then query its
  // execution status on every call.
  static inline sycl::event h2d_kernel_event;
  static inline sycl::event d2h_kernel_event;
};
}  // namespace detail
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// Host Pipe
namespace sycl::ext::intel::prototype {
  // User-facing name of the host pipe: an alias for the ring buffer based
  // implementation in ::detail. Everything except the ID, the element type
  // and the minimum capacity has a default.
  template <typename Id_, typename T_, size_t min_capacity_, int32_t ready_latency_=0,
            int32_t bits_per_symbol_=1, bool uses_valid_=true, bool first_symbol_in_high_order_bits_=false,
            internal::protocol_name protocol_=internal::protocol_name::AVALON_STREAMING>
  using pipe = ::detail::host_pipe_impl<Id_, T_, min_capacity_, ready_latency_, bits_per_symbol_,
                                        uses_valid_, first_symbol_in_high_order_bits_, protocol_>;
}  // namespace sycl::ext::intel::prototype
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////

#endif /* __HOST_PIPES_HPP__ */
