use std::{
    collections::HashMap,
    fmt::Debug,
    iter::FusedIterator,
    sync::{atomic::AtomicU64, Arc, Mutex, RwLock, RwLockReadGuard, TryLockError, Weak},
};

use bytemuck::BoxBytes;
use itertools::Itertools;
use tracing::warn;

use crate::emulator::{EmulatorClient, EmulatorCommand, SimId};

pub struct MemoryClient {
    client: EmulatorClient,
    regions: Mutex<HashMap<MemoryRange, Weak<MemoryRegion>>>,
}

impl MemoryClient {
    pub fn new(client: EmulatorClient) -> Self {
        Self {
            client,
            regions: Mutex::new(HashMap::new()),
        }
    }

    pub fn watch(&self, sim: SimId, start: u32, length: usize) -> MemoryView {
        let range = MemoryRange { sim, start, length };
        let mut regions = self.regions.lock().unwrap_or_else(|e| e.into_inner());
        let region = regions
            .get(&range)
            .and_then(|r| r.upgrade())
            .unwrap_or_else(|| {
                let region = Arc::new(MemoryRegion::new(start, length));
                regions.insert(range, Arc::downgrade(&region));
                self.client
                    .send_command(EmulatorCommand::WatchMemory(range, Arc::downgrade(&region)));
                region
            });
        MemoryView { region }
    }

    pub fn write<T: MemoryValue>(&self, sim: SimId, address: u32, data: &T) {
        let mut buffer = vec![];
        data.to_bytes(&mut buffer);
        let (tx, _) = oneshot::channel();
        self.client
            .send_command(EmulatorCommand::WriteMemory(sim, address, buffer, tx));
    }
}

fn aligned_memory(start: u32, length: usize) -> BoxBytes {
    if start % 4 == 0 && length % 4 == 0 {
        let memory = vec![0u32; length / 4].into_boxed_slice();
        return bytemuck::box_bytes_of(memory);
    }
    if start % 2 == 0 && length % 2 == 0 {
        let memory = vec![0u16; length / 2].into_boxed_slice();
        return bytemuck::box_bytes_of(memory);
    }
    let memory = vec![0u8; length].into_boxed_slice();
    bytemuck::box_bytes_of(memory)
}

pub struct MemoryView {
    region: Arc<MemoryRegion>,
}

impl MemoryView {
    pub fn borrow(&self) -> MemoryRef<'_> {
        self.region.borrow()
    }
}

pub struct MemoryRef<'a> {
    inner: RwLockReadGuard<'a, BoxBytes>,
}

pub trait MemoryValue {
    fn from_bytes(bytes: &[u8]) -> Self;
    fn to_bytes(&self, buffer: &mut Vec<u8>);
}
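// Illustrative usage sketch, not part of the original API surface: shows how a
// caller holding a `MemoryClient` and a `SimId` might watch a region and read
// from it. The function name, address, and length are hypothetical placeholders.
#[allow(dead_code)]
fn example_poll_memory(client: &MemoryClient, sim: SimId) -> u16 {
    // Watch 512 bytes starting at an arbitrary example address.
    let view = client.watch(sim, 0x0002_0000, 0x200);
    // Borrow the freshest snapshot and decode the first little-endian u16.
    view.borrow().read::<u16>(0)
}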
macro_rules! primitive_memory_value_impl {
    ($T:ty, $L: expr) => {
        impl MemoryValue for $T {
            #[inline]
            fn from_bytes(bytes: &[u8]) -> Self {
                let bytes: [u8; std::mem::size_of::<$T>()] = std::array::from_fn(|i| bytes[i]);
                <$T>::from_le_bytes(bytes)
            }

            #[inline]
            fn to_bytes(&self, buffer: &mut Vec<u8>) {
                buffer.extend_from_slice(&self.to_le_bytes())
            }
        }
    };
}
primitive_memory_value_impl!(u8, 1);
primitive_memory_value_impl!(u16, 2);
primitive_memory_value_impl!(u32, 4);

impl<T: MemoryValue, const N: usize> MemoryValue for [T; N] {
    #[inline]
    fn from_bytes(bytes: &[u8]) -> Self {
        std::array::from_fn(|i| {
            T::from_bytes(&bytes[i * std::mem::size_of::<T>()..(i + 1) * std::mem::size_of::<T>()])
        })
    }

    #[inline]
    fn to_bytes(&self, buffer: &mut Vec<u8>) {
        for item in self {
            item.to_bytes(buffer);
        }
    }
}

pub struct MemoryIter<'a, T> {
    bytes: &'a [u8],
    _phantom: std::marker::PhantomData<T>,
}

impl<'a, T> MemoryIter<'a, T> {
    fn new(bytes: &'a [u8]) -> Self {
        Self {
            bytes,
            _phantom: std::marker::PhantomData,
        }
    }
}

impl<T: MemoryValue> Iterator for MemoryIter<'_, T> {
    type Item = T;

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        let (bytes, rest) = self.bytes.split_at_checked(std::mem::size_of::<T>())?;
        self.bytes = rest;
        Some(T::from_bytes(bytes))
    }

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let size = self.bytes.len() / std::mem::size_of::<T>();
        (size, Some(size))
    }
}

impl<T: MemoryValue> DoubleEndedIterator for MemoryIter<'_, T> {
    fn next_back(&mut self) -> Option<Self::Item> {
        let mid = self.bytes.len().checked_sub(std::mem::size_of::<T>())?;
        // SAFETY: the checked_sub above is effectively a bounds check
        let (rest, bytes) = unsafe { self.bytes.split_at_unchecked(mid) };
        self.bytes = rest;
        Some(T::from_bytes(bytes))
    }
}

impl<T: MemoryValue> FusedIterator for MemoryIter<'_, T> {}

impl MemoryRef<'_> {
    pub fn read<T: MemoryValue>(&self, index: usize) -> T {
        let from = index * size_of::<T>();
        let to = from + size_of::<T>();
        T::from_bytes(&self.inner[from..to])
    }

    pub fn range<T: MemoryValue>(&self, start: usize, count: usize) -> MemoryIter<'_, T> {
        let from = start * size_of::<T>();
        let to = from + (count * size_of::<T>());
        MemoryIter::new(&self.inner[from..to])
    }
}
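// Illustrative sketch, not part of the original module: a hypothetical helper
// showing how `MemoryRef::range` decodes a span of the watched region as typed
// values. The element type and count are placeholders chosen for the example.
#[allow(dead_code)]
fn example_decode_words(memory: &MemoryRef<'_>) -> Vec<u32> {
    // Decode the first 64 little-endian u32s (256 bytes) of the region.
    memory.range::<u32>(0, 64).collect()
}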
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub struct MemoryRange {
    pub sim: SimId,
    pub start: u32,
    pub length: usize,
}

const BUFFERS: usize = 4;

pub struct MemoryRegion {
    gens: [AtomicU64; BUFFERS],
    bufs: [RwLock<BoxBytes>; BUFFERS],
}

impl Debug for MemoryRegion {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("MemoryRegion")
            .field("gens", &self.gens)
            .finish_non_exhaustive()
    }
}

// SAFETY: BoxBytes is meant to be Send+Sync, will be in a future version
unsafe impl Send for MemoryRegion {}
// SAFETY: BoxBytes is meant to be Send+Sync, will be in a future version
unsafe impl Sync for MemoryRegion {}

impl MemoryRegion {
    fn new(start: u32, length: usize) -> Self {
        Self {
            gens: std::array::from_fn(|i| AtomicU64::new(i as u64)),
            bufs: std::array::from_fn(|_| RwLock::new(aligned_memory(start, length))),
        }
    }

    pub fn borrow(&self) -> MemoryRef<'_> {
        /*
         * When reading memory, a thread will grab the newest buffer (with the highest gen).
         * It will only fail to grab the lock if the writer already has it,
         * but the writer prioritizes older buffers (with lower gens).
         * So this method will only block if the writer produces three full buffers
         * in the time it takes the reader to do four atomic reads and grab a lock.
         * In the unlikely event this happens... just try again.
         */
        loop {
            let newest_index = self
                .gens
                .iter()
                .map(|i| i.load(std::sync::atomic::Ordering::Acquire))
                .enumerate()
                .max_by_key(|(_, gen)| *gen)
                .map(|(i, _)| i)
                .unwrap();
            let inner = match self.bufs[newest_index].try_read() {
                Ok(inner) => inner,
                Err(TryLockError::Poisoned(e)) => e.into_inner(),
                Err(TryLockError::WouldBlock) => {
                    continue;
                }
            };
            break MemoryRef { inner };
        }
    }

    pub fn update(&self, data: &[u8]) {
        let gens: Vec<u64> = self
            .gens
            .iter()
            .map(|i| i.load(std::sync::atomic::Ordering::Acquire))
            .collect();
        let next_gen = gens.iter().max().unwrap() + 1;
        let indices = gens
            .into_iter()
            .enumerate()
            .sorted_by_key(|(_, val)| *val)
            .map(|(i, _)| i);
        for index in indices {
            let mut lock = match self.bufs[index].try_write() {
                Ok(inner) => inner,
                Err(TryLockError::Poisoned(e)) => e.into_inner(),
                Err(TryLockError::WouldBlock) => {
                    continue;
                }
            };
            lock.copy_from_slice(data);
            self.gens[index].store(next_gen, std::sync::atomic::Ordering::Release);
            return;
        }
        /*
         * We have four buffers, and (at time of writing) only three threads interacting with memory:
         * - The UI thread, reading small regions of memory
         * - The "vram renderer" thread, reading large regions of memory
         * - The emulation thread, writing memory every so often
         * So it should be impossible for all four buffers to have a read lock at the same time,
         * and (because readers always read the newest buffer) at least one of the oldest three
         * buffers will be free the entire time we're in this method.
         * TL;DR this should never happen.
         * But if it does, do nothing. This isn't medical software, better to show stale data than crash.
         */
        warn!("all buffers were locked by a reader at the same time")
    }
}
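// Illustrative sanity tests, added as a sketch rather than taken from the
// original project. They only cover the parts of this module with no emulator
// dependencies (the MemoryValue impls and MemoryRegion's buffer rotation);
// the test names and values are arbitrary.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn primitive_and_array_values_round_trip() {
        let value: [u16; 3] = [1, 2, 0xBEEF];
        let mut bytes = vec![];
        value.to_bytes(&mut bytes);
        // Three u16s serialize to six little-endian bytes.
        assert_eq!(bytes.len(), 6);
        assert_eq!(<[u16; 3]>::from_bytes(&bytes), value);
    }

    #[test]
    fn region_updates_are_visible_to_readers() {
        // An 8-byte, word-aligned region; the start address is arbitrary.
        let region = MemoryRegion::new(0, 8);
        region.update(&[1, 0, 2, 0, 3, 0, 4, 0]);

        // Readers should observe the newest generation of the buffers.
        let memory = region.borrow();
        assert_eq!(memory.read::<u16>(1), 2);
        let words: Vec<u16> = memory.range::<u16>(0, 4).collect();
        assert_eq!(words, vec![1, 2, 3, 4]);
    }
}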