"""
GPU-Accelerated Geometry Primitives Library

Uses CuPy for CUDA-accelerated image transforms.
Falls back to CPU if GPU unavailable.

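Performance Mode:
- GPU persistence is ON by default (STREAMING_GPU_PERSIST defaults to "1"):
  results stay on the GPU between operations
- This dramatically improves performance by avoiding CPU<->GPU transfers
- Set STREAMING_GPU_PERSIST=0 (any value other than "1") to copy every
  result back to the CPU instead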
"""
import os
import numpy as np

# Try to import CuPy for GPU acceleration.
# A successful import is not enough: CuPy can be imported on a host that has
# no usable CUDA device, so the import is followed by an explicit device probe.
# Without the probe every primitive would take the GPU path and fail at call
# time instead of falling back to the CPU.
try:
    import cupy as cp
    from cupyx.scipy import ndimage as cpndimage
except ImportError:
    cp = np
    GPU_AVAILABLE = False
    print("[geometry_gpu] CuPy not available, using CPU fallback")
else:
    try:
        GPU_AVAILABLE = cp.cuda.runtime.getDeviceCount() > 0
    except cp.cuda.runtime.CUDARuntimeError:
        # Raised when the CUDA runtime cannot enumerate devices.
        GPU_AVAILABLE = False

    if GPU_AVAILABLE:
        print("[geometry_gpu] CuPy GPU acceleration enabled")
    else:
        # Same fallback state as the "CuPy not installed" branch.
        cp = np
        print("[geometry_gpu] No usable CUDA device, using CPU fallback")

# GPU persistence mode - results stay on the GPU between operations.
# Enabled by default; any STREAMING_GPU_PERSIST value other than "1" turns it
# off, in which case _to_cpu() copies each result back to host memory.
GPU_PERSIST = "1" == os.environ.get("STREAMING_GPU_PERSIST", "1")
if GPU_PERSIST and GPU_AVAILABLE:
    print("[geometry_gpu] GPU persistence enabled - frames stay on GPU")


def _to_gpu(img):
    """Return ``img`` as a CuPy array when the GPU backend is active.

    The input is returned unchanged when ``GPU_AVAILABLE`` is false or when
    it already is a ``cp.ndarray``.
    """
    if not GPU_AVAILABLE:
        return img
    if isinstance(img, cp.ndarray):
        return img
    return cp.asarray(img)


def _to_cpu(img):
    """Copy a CuPy array to host memory unless GPU persistence is enabled.

    The input is returned unchanged when ``GPU_PERSIST`` is on, when the GPU
    backend is inactive, or when ``img`` is not a ``cp.ndarray``.
    """
    stays_put = GPU_PERSIST or not GPU_AVAILABLE
    if stays_put or not isinstance(img, cp.ndarray):
        return img
    return cp.asnumpy(img)


def _ensure_output_format(img):
    """Return ``img`` in the output location selected by ``GPU_PERSIST``.

    Thin alias for ``_to_cpu``.
    """
    formatted = _to_cpu(img)
    return formatted


def prim_rotate(img, angle, cx=None, cy=None):
    """Rotate image by angle degrees around center (cx, cy).

    Args:
        img: Image array indexed (row, col) or (row, col, channel).
        angle: Rotation angle in degrees.
        cx: Pivot x coordinate in pixels. Defaults to ``w / 2`` on the CPU
            path and whenever only ``cy`` is supplied.
        cy: Pivot y coordinate in pixels. Defaults to ``h / 2`` likewise.

    Returns:
        The rotated image with the same shape as the input. On the GPU path
        the result is passed through ``_to_cpu`` (so it stays on the GPU
        when ``GPU_PERSIST`` is enabled).

    Previously the GPU path computed ``cx``/``cy`` but never used them, so a
    caller-supplied pivot was silently ignored. It is now honoured. When no
    pivot is supplied the GPU path is unchanged and still delegates to
    ``cupyx.scipy.ndimage.rotate``, which pivots on the array centre.
    """
    if not GPU_AVAILABLE:
        # Fallback to OpenCV
        import cv2
        h, w = img.shape[:2]
        if cx is None:
            cx = w / 2
        if cy is None:
            cy = h / 2
        M = cv2.getRotationMatrix2D((cx, cy), angle, 1.0)
        return cv2.warpAffine(img, M, (w, h))

    img_gpu = _to_gpu(img)

    if cx is None and cy is None:
        # No explicit pivot: keep the existing centre rotation.
        rotated = cpndimage.rotate(img_gpu, angle, reshape=False, order=1)
        return _to_cpu(rotated)

    h, w = img_gpu.shape[:2]
    if cx is None:
        cx = w / 2
    if cy is None:
        cy = h / 2

    theta = np.radians(angle)
    cos_a = float(np.cos(theta))
    sin_a = float(np.sin(theta))

    # affine_transform samples input[matrix @ out_index + offset], with
    # indices ordered (row, col[, channel]). The 2x2 block below is the same
    # output->input rotation that ndimage.rotate builds and that
    # cv2.warpAffine obtains by inverting getRotationMatrix2D. Any trailing
    # axis (channels) keeps the identity mapping.
    ndim = img_gpu.ndim
    matrix = np.eye(ndim, dtype=np.float64)
    matrix[0, 0] = cos_a
    matrix[0, 1] = sin_a
    matrix[1, 0] = -sin_a
    matrix[1, 1] = cos_a

    pivot = np.zeros(ndim, dtype=np.float64)
    pivot[0] = cy
    pivot[1] = cx
    # Choose the offset so that the pivot maps onto itself.
    offset = pivot - matrix @ pivot

    rotated = cpndimage.affine_transform(
        img_gpu, cp.asarray(matrix), offset=offset.tolist(), order=1
    )
    return _to_cpu(rotated)


def prim_scale(img, sx, sy, cx=None, cy=None):
    """Scale image by (sx, sy) around center (cx, cy).

    Args:
        img: Image array indexed (row, col) or (row, col, channel).
        sx: Horizontal scale factor.
        sy: Vertical scale factor.
        cx: x coordinate of the fixed point in pixels (default ``w / 2``).
        cy: y coordinate of the fixed point in pixels (default ``h / 2``).

    Returns:
        An image with the same shape and dtype as the input; uncovered areas
        are zero. On the GPU path the result is passed through ``_to_cpu``.

    The GPU path used to zoom the whole frame and centre-crop/pad it, which
    ignored ``cx``/``cy`` and allocated an intermediate ``sx * sy`` times the
    size of the frame. It now resamples straight into an output of the
    original size, using the same mapping as the CPU affine matrix.
    """
    if not GPU_AVAILABLE:
        import cv2
        h, w = img.shape[:2]
        if cx is None:
            cx = w / 2
        if cy is None:
            cy = h / 2
        M = np.float32([
            [sx, 0, cx * (1 - sx)],
            [0, sy, cy * (1 - sy)]
        ])
        return cv2.warpAffine(img, M, (w, h))

    img_gpu = _to_gpu(img)
    h, w = img_gpu.shape[:2]

    if cx is None:
        cx = w / 2
    if cy is None:
        cy = h / 2

    if sx == 0 or sy == 0:
        # A zero factor collapses the image and has no inverse mapping;
        # return an empty (black) frame instead of dividing by zero.
        return _to_cpu(cp.zeros_like(img_gpu))

    inv_sx = 1.0 / float(sx)
    inv_sy = 1.0 / float(sy)

    # Forward map (as in the CPU matrix): dst = s * src + c * (1 - s)
    # Inverse map used for sampling:      src = dst / s + c * (1 - 1 / s)
    # A 1-D "matrix" tells affine_transform the transform is diagonal.
    diag = [inv_sy, inv_sx]
    offset = [float(cy) * (1.0 - inv_sy), float(cx) * (1.0 - inv_sx)]

    # Leave any trailing axes (colour channels) untouched.
    extra_axes = img_gpu.ndim - 2
    diag += [1.0] * extra_axes
    offset += [0.0] * extra_axes

    result = cpndimage.affine_transform(
        img_gpu, cp.asarray(diag, dtype=cp.float64), offset=offset, order=1
    )
    return _to_cpu(result)


def prim_translate(img, dx, dy):
    """Shift the image by ``dx`` pixels horizontally and ``dy`` vertically.

    Uses ``cupyx.scipy.ndimage.shift`` (linear interpolation) on the GPU
    path and ``cv2.warpAffine`` on the CPU path.
    """
    if GPU_AVAILABLE:
        gpu_frame = _to_gpu(img)
        # A trailing 0 keeps the channel axis of colour images fixed.
        offsets = (dy, dx, 0) if gpu_frame.ndim == 3 else (dy, dx)
        moved = cpndimage.shift(gpu_frame, offsets, order=1)
        return _to_cpu(moved)

    import cv2
    rows, cols = img.shape[:2]
    affine = np.float32([[1, 0, dx], [0, 1, dy]])
    return cv2.warpAffine(img, affine, (cols, rows))


def prim_flip_h(img):
    """Mirror the image left-to-right (reverse axis 1)."""
    if not GPU_AVAILABLE:
        return np.flip(img, axis=1)
    mirrored = cp.flip(_to_gpu(img), axis=1)
    return _to_cpu(mirrored)


def prim_flip_v(img):
    """Mirror the image top-to-bottom (reverse axis 0)."""
    if not GPU_AVAILABLE:
        return np.flip(img, axis=0)
    mirrored = cp.flip(_to_gpu(img), axis=0)
    return _to_cpu(mirrored)


def prim_flip(img, direction="horizontal"):
    """Flip the image along the requested direction.

    ``direction`` may be "horizontal"/"h", "vertical"/"v", or
    "both"/"hv"/"vh". Any other value returns ``img`` unchanged.
    """
    if direction in ("horizontal", "h"):
        return prim_flip_h(img)
    if direction in ("vertical", "v"):
        return prim_flip_v(img)
    if direction not in ("both", "hv", "vh"):
        # Unrecognised direction: hand the image back untouched.
        return img

    if not GPU_AVAILABLE:
        upside_down = np.flip(img, axis=0)
        return np.flip(upside_down, axis=1)

    upside_down = cp.flip(_to_gpu(img), axis=0)
    return _to_cpu(cp.flip(upside_down, axis=1))


# CUDA kernel for ripple effect (only defined when the GPU path is active).
#
# One thread handles one output pixel (x, y). The kernel displaces the sample
# position radially from (cx, cy) by a sine wave with exponential falloff,
# clamps the source coordinate to the image bounds, and bilinearly
# interpolates each channel.
#
# src and dst are indexed as (y * width + x) * channels + c on unsigned char
# pointers, i.e. the kernel expects packed, row-major, channel-interleaved
# 8-bit buffers.
if GPU_AVAILABLE:
    _ripple_kernel = cp.RawKernel(r'''
    extern "C" __global__
    void ripple(const unsigned char* src, unsigned char* dst,
                int width, int height, int channels,
                float amplitude, float frequency, float decay,
                float speed, float time, float cx, float cy) {
        int x = blockDim.x * blockIdx.x + threadIdx.x;
        int y = blockDim.y * blockIdx.y + threadIdx.y;

        if (x >= width || y >= height) return;

        // Distance from center
        float dx = x - cx;
        float dy = y - cy;
        float dist = sqrtf(dx * dx + dy * dy);

        // Ripple displacement
        float wave = sinf(dist * frequency * 0.1f - time * speed) * amplitude;
        float falloff = expf(-dist * decay * 0.01f);
        float displacement = wave * falloff;

        // Direction from center
        float len = dist + 0.0001f;  // Avoid division by zero
        float dir_x = dx / len;
        float dir_y = dy / len;

        // Source coordinates
        float src_x = x - dir_x * displacement;
        float src_y = y - dir_y * displacement;

        // Clamp to bounds
        src_x = fmaxf(0.0f, fminf(width - 1.0f, src_x));
        src_y = fmaxf(0.0f, fminf(height - 1.0f, src_y));

        // Bilinear interpolation
        int x0 = (int)src_x;
        int y0 = (int)src_y;
        int x1 = min(x0 + 1, width - 1);
        int y1 = min(y0 + 1, height - 1);

        float fx = src_x - x0;
        float fy = src_y - y0;

        for (int c = 0; c < channels; c++) {
            float v00 = src[(y0 * width + x0) * channels + c];
            float v10 = src[(y0 * width + x1) * channels + c];
            float v01 = src[(y1 * width + x0) * channels + c];
            float v11 = src[(y1 * width + x1) * channels + c];

            float v0 = v00 * (1 - fx) + v10 * fx;
            float v1 = v01 * (1 - fx) + v11 * fx;
            float val = v0 * (1 - fy) + v1 * fy;

            dst[(y * width + x) * channels + c] = (unsigned char)fminf(255.0f, fmaxf(0.0f, val));
        }
    }
    ''', 'ripple')


def prim_ripple(img, amplitude=10.0, frequency=8.0, decay=2.0, speed=5.0,
                time=0.0, center_x=None, center_y=None):
    """Apply ripple distortion effect.

    Each output pixel samples the input at a position displaced radially from
    the centre by ``sin(dist * frequency * 0.1 - time * speed) * amplitude``,
    attenuated by ``exp(-dist * decay * 0.01)``.

    Args:
        img: Image array indexed (row, col) or (row, col, channel).
        amplitude: Peak displacement in pixels.
        frequency: Wave frequency factor.
        decay: Falloff factor with distance from the centre.
        speed: Phase speed multiplier applied to ``time``.
        time: Animation time.
        center_x: Ripple centre x in pixels (default ``w / 2``).
        center_y: Ripple centre y in pixels (default ``h / 2``).

    Returns:
        The distorted image. The GPU path converts the input to uint8 and
        passes the result through ``_to_cpu``.

    Fixes over the previous version:
      * The raw kernel indexes memory as packed row-major data, so the input
        is now made C-contiguous explicitly. This also avoids the
        unconditional copy ``astype`` made for frames that were already
        contiguous uint8.
      * The CPU path replicates edge pixels for samples that fall outside
        the frame, matching the coordinate clamp in the GPU kernel. It used
        to fill them with black.
    """
    h, w = img.shape[:2]
    channels = img.shape[2] if img.ndim == 3 else 1

    if center_x is None:
        center_x = w / 2
    if center_y is None:
        center_y = h / 2

    if not GPU_AVAILABLE:
        # CPU fallback using coordinate mapping
        import cv2
        y_coords, x_coords = np.mgrid[0:h, 0:w].astype(np.float32)

        dx = x_coords - center_x
        dy = y_coords - center_y
        dist = np.sqrt(dx**2 + dy**2)

        wave = np.sin(dist * frequency * 0.1 - time * speed) * amplitude
        falloff = np.exp(-dist * decay * 0.01)
        displacement = wave * falloff

        length = dist + 0.0001  # Avoid division by zero at the centre
        dir_x = dx / length
        dir_y = dy / length

        map_x = (x_coords - dir_x * displacement).astype(np.float32)
        map_y = (y_coords - dir_y * displacement).astype(np.float32)

        # BORDER_REPLICATE mirrors the clamp-to-bounds step of the GPU kernel.
        return cv2.remap(img, map_x, map_y, cv2.INTER_LINEAR,
                         borderMode=cv2.BORDER_REPLICATE)

    # GPU implementation.
    # The kernel reads raw bytes assuming a packed (h, w, c) layout, so force
    # a C-contiguous uint8 buffer (no copy when the frame already is one).
    img_gpu = cp.ascontiguousarray(_to_gpu(img), dtype=cp.uint8)
    if img_gpu.ndim == 2:
        img_gpu = img_gpu[:, :, cp.newaxis]
        channels = 1

    dst = cp.zeros_like(img_gpu)

    # One thread per output pixel, 16x16 threads per block.
    block = (16, 16)
    grid = ((w + block[0] - 1) // block[0], (h + block[1] - 1) // block[1])

    _ripple_kernel(grid, block, (
        img_gpu, dst,
        np.int32(w), np.int32(h), np.int32(channels),
        np.float32(amplitude), np.float32(frequency), np.float32(decay),
        np.float32(speed), np.float32(time),
        np.float32(center_x), np.float32(center_y)
    ))

    result = _to_cpu(dst)
    if channels == 1:
        result = result[:, :, 0]
    return result


# CUDA kernel for fast rotation with bilinear interpolation (only defined
# when the GPU path is active).
#
# One thread handles one output pixel (x, y). The source position is
#     src = [[cos_a, sin_a], [-sin_a, cos_a]] * (dst - c) + c
# and pixels whose source falls outside the image are written as 0.
#
# The bounds test is inclusive of the last row and column, and the
# neighbour index is clamped. The previous test (src_x >= width - 1,
# src_y >= height - 1) rejected the final row/column outright, so even a
# 0-degree rotation blanked the right and bottom edges of the frame.
#
# src and dst are indexed as (y * width + x) * channels + c on unsigned char
# pointers, i.e. the kernel expects packed, row-major, channel-interleaved
# 8-bit buffers.
if GPU_AVAILABLE:
    _rotate_kernel = cp.RawKernel(r'''
    extern "C" __global__
    void rotate_img(const unsigned char* src, unsigned char* dst,
                    int width, int height, int channels,
                    float cos_a, float sin_a, float cx, float cy) {
        int x = blockDim.x * blockIdx.x + threadIdx.x;
        int y = blockDim.y * blockIdx.y + threadIdx.y;

        if (x >= width || y >= height) return;

        // Translate to center, rotate, translate back
        float dx = x - cx;
        float dy = y - cy;

        float src_x = cos_a * dx + sin_a * dy + cx;
        float src_y = -sin_a * dx + cos_a * dy + cy;

        // Check bounds (the last row/column is a valid sample position)
        if (src_x < 0 || src_x > width - 1 || src_y < 0 || src_y > height - 1) {
            for (int c = 0; c < channels; c++) {
                dst[(y * width + x) * channels + c] = 0;
            }
            return;
        }

        // Bilinear interpolation; clamp the neighbour so that sampling
        // exactly on the last row/column never reads past the buffer
        int x0 = (int)src_x;
        int y0 = (int)src_y;
        int x1 = min(x0 + 1, width - 1);
        int y1 = min(y0 + 1, height - 1);

        float fx = src_x - x0;
        float fy = src_y - y0;

        for (int c = 0; c < channels; c++) {
            float v00 = src[(y0 * width + x0) * channels + c];
            float v10 = src[(y0 * width + x1) * channels + c];
            float v01 = src[(y1 * width + x0) * channels + c];
            float v11 = src[(y1 * width + x1) * channels + c];

            float v0 = v00 * (1 - fx) + v10 * fx;
            float v1 = v01 * (1 - fx) + v11 * fx;
            float val = v0 * (1 - fy) + v1 * fy;

            dst[(y * width + x) * channels + c] = (unsigned char)fminf(255.0f, fmaxf(0.0f, val));
        }
    }
    ''', 'rotate_img')


def prim_rotate_gpu(img, angle, cx=None, cy=None):
    """Fast GPU rotation using custom CUDA kernel.

    Args:
        img: Image array indexed (row, col) or (row, col, channel).
        angle: Rotation angle in degrees. The direction matches the OpenCV
            fallback in ``prim_rotate``.
        cx: Pivot x coordinate in pixels (default ``w / 2``).
        cy: Pivot y coordinate in pixels (default ``h / 2``).

    Returns:
        The rotated image, converted to uint8, with the input's shape.
        Pixels whose source lies outside the frame are 0. The result is
        passed through ``_to_cpu``. Without a GPU the call is forwarded to
        ``prim_rotate``.

    Fixes over the previous version:
      * The kernel samples ``[[cos, sin], [-sin, cos]] @ (dst - c) + c``,
        whereas ``cv2.warpAffine`` samples with the inverse of the
        ``getRotationMatrix2D`` matrix, ``[[cos, -sin], [sin, cos]]``. The
        GPU path therefore rotated in the opposite direction to this
        function's own CPU fallback. Passing ``-sin`` to the kernel makes
        the two paths agree.
      * The raw kernel indexes memory as packed row-major data, so the input
        is now made C-contiguous explicitly. This also avoids the
        unconditional copy ``astype`` made for frames that were already
        contiguous uint8.
    """
    if not GPU_AVAILABLE:
        return prim_rotate(img, angle, cx, cy)

    h, w = img.shape[:2]
    channels = img.shape[2] if img.ndim == 3 else 1

    if cx is None:
        cx = w / 2
    if cy is None:
        cy = h / 2

    # The kernel reads raw bytes assuming a packed (h, w, c) layout, so force
    # a C-contiguous uint8 buffer (no copy when the frame already is one).
    img_gpu = cp.ascontiguousarray(_to_gpu(img), dtype=cp.uint8)
    if img_gpu.ndim == 2:
        img_gpu = img_gpu[:, :, cp.newaxis]
        channels = 1

    dst = cp.zeros_like(img_gpu)

    # Convert angle to radians
    rad = np.radians(angle)
    cos_a = np.cos(rad)
    sin_a = np.sin(rad)

    # One thread per output pixel, 16x16 threads per block.
    block = (16, 16)
    grid = ((w + block[0] - 1) // block[0], (h + block[1] - 1) // block[1])

    _rotate_kernel(grid, block, (
        img_gpu, dst,
        np.int32(w), np.int32(h), np.int32(channels),
        # Negated sine: see the docstring note on rotation direction.
        np.float32(cos_a), np.float32(-sin_a),
        np.float32(cx), np.float32(cy)
    ))

    result = _to_cpu(dst)
    if channels == 1:
        result = result[:, :, 0]
    return result


# CPU primitives serve as the fallback for everything this module does not
# GPU-accelerate.
def _get_cpu_primitives():
    """Return the ``PRIMITIVES`` table of the CPU geometry module.

    The import is done inside the function, so the CPU module is only loaded
    when this is called.
    """
    from sexp_effects.primitive_libs import geometry as cpu_geometry
    cpu_table = cpu_geometry.PRIMITIVES
    return cpu_table


# Exported table: a copy of the CPU primitives, with the entries below
# replaced by the GPU-accelerated versions from this module.
PRIMITIVES = _get_cpu_primitives().copy()

PRIMITIVES['translate'] = prim_translate
# The custom-kernel rotation is only registered when a GPU is present.
PRIMITIVES['rotate-img'] = prim_rotate_gpu if GPU_AVAILABLE else prim_rotate
PRIMITIVES['scale-img'] = prim_scale
PRIMITIVES['flip-h'] = prim_flip_h
PRIMITIVES['flip-v'] = prim_flip_v
PRIMITIVES['flip'] = prim_flip
# Note: ripple-displace uses CPU version (different API - returns coords, not image)