"""
Sexp to CUDA Kernel Compiler.

Compiles sexp frame pipelines to fused CUDA kernels for maximum performance.
Instead of interpreting sexp and launching 10+ kernels per frame,
generates a single kernel that does everything in one pass.
"""

import cupy as cp
import numpy as np
from typing import Dict, List, Any, Optional, Tuple
import hashlib
import sys

# Module-level cache of compiled pipeline callables, shared by
# compile_frame_pipeline and compile_autonomous_pipeline.
# Keys are "<width>x<height>_<md5 of the effect description>"; autonomous
# pipelines add an "auto_" prefix and also hash their dynamic expressions.
# Entries are never evicted.
_COMPILED_KERNELS: Dict[str, Any] = {}


def compile_frame_pipeline(effects: List[dict], width: int, height: int) -> callable:
    """
    Compile a list of effects to a fused CUDA kernel.

    The returned callable is cached in ``_COMPILED_KERNELS`` under a key made
    of the frame dimensions and a hash of the effect dicts, so compiling an
    equivalent pipeline again returns the same callable.

    Args:
        effects: List of effect dicts like:
            [{'op': 'rotate', 'angle': 45.0},
             {'op': 'hue_shift', 'degrees': 90.0},
             {'op': 'ripple', 'amplitude': 10.0, 'frequency': 8.0, ...}]
            NOTE: ops without a branch in ``_generate_fused_kernel``
            (e.g. 'blend') currently generate no kernel code.
        width, height: Frame dimensions

    Returns:
        Callable ``run_pipeline(frame, **dynamic_params)`` that takes an input
        frame of shape ``(height, width, 3)`` and returns the output frame.
        It raises ``ValueError`` when the frame has any other shape.
    """

    # Generate cache key. 'src2' is left out of the key, so two pipelines that
    # differ only in that entry share one compiled callable.
    ops_key = str([(e['op'], {k:v for k,v in e.items() if k != 'src2'}) for e in effects])
    cache_key = f"{width}x{height}_{hashlib.md5(ops_key.encode()).hexdigest()}"

    cached = _COMPILED_KERNELS.get(cache_key)
    if cached is not None:
        return cached

    # Generate and compile the fused kernel
    kernel_code = _generate_fused_kernel(effects, width, height)
    kernel = cp.RawKernel(kernel_code, 'fused_pipeline')

    # Launch geometry and scalar arguments never change for this pipeline, so
    # they are computed once here instead of on every frame.
    expected_shape = (height, width, 3)
    block = (16, 16)
    grid = ((width + 15) // 16, (height + 15) // 16)
    # The kernel declares `int width, int height`. CuPy passes plain Python
    # ints as 64-bit values, so hand over explicit 32-bit scalars instead.
    width_arg = np.int32(width)
    height_arg = np.int32(height)

    # Create wrapper function
    def run_pipeline(frame: cp.ndarray, **dynamic_params) -> cp.ndarray:
        """Run the compiled pipeline on a frame."""
        # The kernel indexes both buffers as (y * width + x) * 3 using the
        # compile-time dimensions; any other layout would read or write
        # outside the arrays.
        if frame.shape != expected_shape:
            raise ValueError(
                f"frame shape {frame.shape} does not match the compiled "
                f"pipeline shape {expected_shape}"
            )
        if frame.dtype != cp.uint8:
            frame = cp.clip(frame, 0, 255).astype(cp.uint8)
        if not frame.flags['C_CONTIGUOUS']:
            frame = cp.ascontiguousarray(frame)

        output = cp.zeros_like(frame)

        # Build parameter array (static effect values plus per-call overrides)
        params = _build_params(effects, dynamic_params)

        kernel(grid, block, (frame, output, width_arg, height_arg, params))

        return output

    _COMPILED_KERNELS[cache_key] = run_pipeline
    return run_pipeline


def _generate_fused_kernel(effects: List[dict], width: int, height: int) -> str:
    """Generate CUDA kernel code for fused effects pipeline."""

    # Build the kernel
    code = r'''
extern "C" __global__
void fused_pipeline(
    const unsigned char* src,
    unsigned char* dst,
    int width, int height,
    const float* params
) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= width || y >= height) return;

    // Start with source coordinates
    float src_x = (float)x;
    float src_y = (float)y;
    float cx = width / 2.0f;
    float cy = height / 2.0f;

    // Track accumulated transforms
    float total_cos = 1.0f, total_sin = 0.0f;  // rotation
    float total_zoom = 1.0f;  // zoom
    float ripple_dx = 0.0f, ripple_dy = 0.0f;  // ripple displacement

    int param_idx = 0;

'''

    # Add effect-specific code
    for i, effect in enumerate(effects):
        op = effect['op']

        if op == 'rotate':
            code += f'''
    // Rotate {i}
    {{
        float angle = params[param_idx++] * 3.14159265f / 180.0f;
        float c = cosf(angle);
        float s = sinf(angle);
        // Compose with existing rotation
        float nc = total_cos * c - total_sin * s;
        float ns = total_cos * s + total_sin * c;
        total_cos = nc;
        total_sin = ns;
    }}
'''
        elif op == 'zoom':
            code += f'''
    // Zoom {i}
    {{
        float zoom = params[param_idx++];
        total_zoom *= zoom;
    }}
'''
        elif op == 'ripple':
            code += f'''
    // Ripple {i}
    {{
        float amplitude = params[param_idx++];
        float frequency = params[param_idx++];
        float decay = params[param_idx++];
        float phase = params[param_idx++];
        float rcx = params[param_idx++];
        float rcy = params[param_idx++];

        float rdx = src_x - rcx;
        float rdy = src_y - rcy;
        float dist = sqrtf(rdx * rdx + rdy * rdy);

        float wave = sinf(dist * frequency * 0.1f + phase);
        float amp = amplitude * expf(-dist * decay * 0.01f);

        if (dist > 0.001f) {{
            ripple_dx += rdx / dist * wave * amp;
            ripple_dy += rdy / dist * wave * amp;
        }}
    }}
'''

    # Apply all geometric transforms at once
    code += '''
    // Apply accumulated geometric transforms
    {
        // Translate to center
        float dx = src_x - cx;
        float dy = src_y - cy;

        // Apply rotation
        float rx = total_cos * dx + total_sin * dy;
        float ry = -total_sin * dx + total_cos * dy;

        // Apply zoom (inverse for sampling)
        rx /= total_zoom;
        ry /= total_zoom;

        // Translate back and apply ripple
        src_x = rx + cx - ripple_dx;
        src_y = ry + cy - ripple_dy;
    }

    // Sample source with bilinear interpolation
    float r, g, b;
    if (src_x < 0 || src_x >= width - 1 || src_y < 0 || src_y >= height - 1) {
        r = g = b = 0;
    } else {
        int x0 = (int)src_x;
        int y0 = (int)src_y;
        float fx = src_x - x0;
        float fy = src_y - y0;

        int idx00 = (y0 * width + x0) * 3;
        int idx10 = (y0 * width + x0 + 1) * 3;
        int idx01 = ((y0 + 1) * width + x0) * 3;
        int idx11 = ((y0 + 1) * width + x0 + 1) * 3;

        #define BILERP(c) \\
            (src[idx00 + c] * (1-fx) * (1-fy) + \\
             src[idx10 + c] * fx * (1-fy) + \\
             src[idx01 + c] * (1-fx) * fy + \\
             src[idx11 + c] * fx * fy)

        r = BILERP(0);
        g = BILERP(1);
        b = BILERP(2);
    }

'''

    # Add color transforms
    for i, effect in enumerate(effects):
        op = effect['op']

        if op == 'invert':
            code += f'''
    // Invert {i}
    {{
        float amount = params[param_idx++];
        if (amount > 0.5f) {{
            r = 255.0f - r;
            g = 255.0f - g;
            b = 255.0f - b;
        }}
    }}
'''
        elif op == 'hue_shift':
            code += f'''
    // Hue shift {i}
    {{
        float shift = params[param_idx++];
        if (fabsf(shift) > 0.01f) {{
            // RGB to HSV
            float rf = r / 255.0f;
            float gf = g / 255.0f;
            float bf = b / 255.0f;

            float max_c = fmaxf(rf, fmaxf(gf, bf));
            float min_c = fminf(rf, fminf(gf, bf));
            float delta = max_c - min_c;

            float h = 0, s = 0, v = max_c;

            if (delta > 0.00001f) {{
                s = delta / max_c;
                if (rf >= max_c) h = (gf - bf) / delta;
                else if (gf >= max_c) h = 2.0f + (bf - rf) / delta;
                else h = 4.0f + (rf - gf) / delta;
                h *= 60.0f;
                if (h < 0) h += 360.0f;
            }}

            h = fmodf(h + shift + 360.0f, 360.0f);

            // HSV to RGB
            float c = v * s;
            float x_val = c * (1 - fabsf(fmodf(h / 60.0f, 2.0f) - 1));
            float m = v - c;

            float r2, g2, b2;
            if (h < 60) {{ r2 = c; g2 = x_val; b2 = 0; }}
            else if (h < 120) {{ r2 = x_val; g2 = c; b2 = 0; }}
            else if (h < 180) {{ r2 = 0; g2 = c; b2 = x_val; }}
            else if (h < 240) {{ r2 = 0; g2 = x_val; b2 = c; }}
            else if (h < 300) {{ r2 = x_val; g2 = 0; b2 = c; }}
            else {{ r2 = c; g2 = 0; b2 = x_val; }}

            r = (r2 + m) * 255.0f;
            g = (g2 + m) * 255.0f;
            b = (b2 + m) * 255.0f;
        }}
    }}
'''
        elif op == 'brightness':
            code += f'''
    // Brightness {i}
    {{
        float factor = params[param_idx++];
        r *= factor;
        g *= factor;
        b *= factor;
    }}
'''

    # Write output
    code += '''
    // Write output
    int dst_idx = (y * width + x) * 3;
    dst[dst_idx] = (unsigned char)fminf(255.0f, fmaxf(0.0f, r));
    dst[dst_idx + 1] = (unsigned char)fminf(255.0f, fmaxf(0.0f, g));
    dst[dst_idx + 2] = (unsigned char)fminf(255.0f, fmaxf(0.0f, b));
}
'''

    return code


def _build_params(effects: List[dict], dynamic_params: dict) -> cp.ndarray:
    """Build parameter array for kernel."""
    params = []

    for effect in effects:
        op = effect['op']

        if op == 'rotate':
            params.append(float(dynamic_params.get('rotate_angle', effect.get('angle', 0))))
        elif op == 'zoom':
            params.append(float(dynamic_params.get('zoom_amount', effect.get('amount', 1.0))))
        elif op == 'ripple':
            params.append(float(effect.get('amplitude', 10)))
            params.append(float(effect.get('frequency', 8)))
            params.append(float(effect.get('decay', 2)))
            params.append(float(dynamic_params.get('ripple_phase', effect.get('phase', 0))))
            params.append(float(effect.get('center_x', 960)))
            params.append(float(effect.get('center_y', 540)))
        elif op == 'invert':
            params.append(float(effect.get('amount', 0)))
        elif op == 'hue_shift':
            params.append(float(effect.get('degrees', 0)))
        elif op == 'brightness':
            params.append(float(effect.get('factor', 1.0)))

    return cp.array(params, dtype=cp.float32)


def compile_autonomous_pipeline(effects: List[dict], width: int, height: int,
                                 dynamic_expressions: dict = None) -> callable:
    """
    Compile a fully autonomous pipeline that computes ALL parameters on GPU.

    This eliminates Python from the hot path - the kernel computes time-based
    parameters (sin, cos, etc.) directly on GPU. The compiled callable is
    cached in ``_COMPILED_KERNELS`` under a key built from the dimensions, the
    effect dicts and the dynamic expressions.

    Args:
        effects: List of effect dicts
        width, height: Frame dimensions
        dynamic_expressions: Dict mapping param names to expressions, e.g.:
            {'rotate_angle': 't * 30',
             'ripple_phase': 't * 2',
             'brightness_factor': '0.8 + 0.4 * sin(t * 2)'}
            The expressions are inserted into the CUDA source; ``t`` is the
            time in seconds (frame_num / fps). ``None`` means no expressions.

    Returns:
        Callable that takes (frame, frame_num, fps) and returns output frame.
        The frame must have shape ``(height, width, 3)``; any other shape
        raises ``ValueError``.
    """
    if dynamic_expressions is None:
        dynamic_expressions = {}

    # Generate cache key ('src2' entries are left out of the key)
    ops_key = str([(e['op'], {k:v for k,v in e.items() if k != 'src2'}) for e in effects])
    expr_key = str(sorted(dynamic_expressions.items()))
    cache_key = f"auto_{width}x{height}_{hashlib.md5((ops_key + expr_key).encode()).hexdigest()}"

    cached = _COMPILED_KERNELS.get(cache_key)
    if cached is not None:
        return cached

    # Generate and compile the autonomous kernel
    kernel_code = _generate_autonomous_kernel(effects, width, height, dynamic_expressions)
    kernel = cp.RawKernel(kernel_code, 'autonomous_pipeline')

    # Everything that is constant for this pipeline is prepared once, so the
    # per-frame wrapper does as little Python work as possible.
    expected_shape = (height, width, 3)
    block = (16, 16)
    grid = ((width + 15) // 16, (height + 15) // 16)
    width_arg = np.int32(width)
    height_arg = np.int32(height)

    # Create wrapper function
    def run_autonomous(frame: cp.ndarray, frame_num: int, fps: float = 30.0) -> cp.ndarray:
        """Run the autonomous pipeline - no Python in the hot path!"""
        # The kernel indexes both buffers as (y * width + x) * 3 using the
        # compile-time dimensions; any other layout would read or write
        # outside the arrays.
        if frame.shape != expected_shape:
            raise ValueError(
                f"frame shape {frame.shape} does not match the compiled "
                f"pipeline shape {expected_shape}"
            )
        if frame.dtype != cp.uint8:
            frame = cp.clip(frame, 0, 255).astype(cp.uint8)
        if not frame.flags['C_CONTIGUOUS']:
            frame = cp.ascontiguousarray(frame)

        output = cp.zeros_like(frame)

        # Only pass frame_num and fps - kernel computes everything else!
        t = float(frame_num) / float(fps)
        kernel(grid, block, (frame, output, width_arg, height_arg,
                            np.float32(t), np.int32(frame_num)))

        return output

    _COMPILED_KERNELS[cache_key] = run_autonomous
    return run_autonomous


def _generate_autonomous_kernel(effects: List[dict], width: int, height: int,
                                 dynamic_expressions: dict) -> str:
    """Generate CUDA kernel that computes everything autonomously."""

    # Map simple expressions to CUDA code
    def expr_to_cuda(expr: str) -> str:
        """Convert simple expression to CUDA."""
        expr = expr.replace('sin(', 'sinf(')
        expr = expr.replace('cos(', 'cosf(')
        expr = expr.replace('abs(', 'fabsf(')
        return expr

    code = r'''
extern "C" __global__
void autonomous_pipeline(
    const unsigned char* src,
    unsigned char* dst,
    int width, int height,
    float t, int frame_num
) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= width || y >= height) return;

    // Compute dynamic parameters from time (ALL ON GPU!)
'''

    # Add dynamic parameter calculations
    rotate_expr = dynamic_expressions.get('rotate_angle', '0.0f')
    ripple_phase_expr = dynamic_expressions.get('ripple_phase', '0.0f')
    brightness_expr = dynamic_expressions.get('brightness_factor', '1.0f')
    zoom_expr = dynamic_expressions.get('zoom_amount', '1.0f')

    code += f'''
    float rotate_angle = {expr_to_cuda(rotate_expr)};
    float ripple_phase = {expr_to_cuda(ripple_phase_expr)};
    float brightness_factor = {expr_to_cuda(brightness_expr)};
    float zoom_amount = {expr_to_cuda(zoom_expr)};

    // Start with source coordinates
    float src_x = (float)x;
    float src_y = (float)y;
    float cx = width / 2.0f;
    float cy = height / 2.0f;

    // Accumulated transforms
    float total_cos = 1.0f, total_sin = 0.0f;
    float total_zoom = 1.0f;
    float ripple_dx = 0.0f, ripple_dy = 0.0f;

'''

    # Add effect-specific code
    for i, effect in enumerate(effects):
        op = effect['op']

        if op == 'rotate':
            code += f'''
    // Rotate {i}
    {{
        float angle = rotate_angle * 3.14159265f / 180.0f;
        float c = cosf(angle);
        float s = sinf(angle);
        float nc = total_cos * c - total_sin * s;
        float ns = total_cos * s + total_sin * c;
        total_cos = nc;
        total_sin = ns;
    }}
'''
        elif op == 'zoom':
            code += f'''
    // Zoom {i}
    {{
        total_zoom *= zoom_amount;
    }}
'''
        elif op == 'ripple':
            amp = float(effect.get('amplitude', 10))
            freq = float(effect.get('frequency', 8))
            decay = float(effect.get('decay', 2))
            rcx = float(effect.get('center_x', width/2))
            rcy = float(effect.get('center_y', height/2))
            code += f'''
    // Ripple {i}
    {{
        float amplitude = {amp:.1f}f;
        float frequency = {freq:.1f}f;
        float decay_val = {decay:.1f}f;
        float rcx = {rcx:.1f}f;
        float rcy = {rcy:.1f}f;

        float rdx = src_x - rcx;
        float rdy = src_y - rcy;
        float dist = sqrtf(rdx * rdx + rdy * rdy);

        float wave = sinf(dist * frequency * 0.1f + ripple_phase);
        float amp = amplitude * expf(-dist * decay_val * 0.01f);

        if (dist > 0.001f) {{
            ripple_dx += rdx / dist * wave * amp;
            ripple_dy += rdy / dist * wave * amp;
        }}
    }}
'''

    # Apply geometric transforms
    code += '''
    // Apply accumulated transforms
    {
        float dx = src_x - cx;
        float dy = src_y - cy;
        float rx = total_cos * dx + total_sin * dy;
        float ry = -total_sin * dx + total_cos * dy;
        rx /= total_zoom;
        ry /= total_zoom;
        src_x = rx + cx - ripple_dx;
        src_y = ry + cy - ripple_dy;
    }

    // Bilinear sample
    float r, g, b;
    if (src_x < 0 || src_x >= width - 1 || src_y < 0 || src_y >= height - 1) {
        r = g = b = 0;
    } else {
        int x0 = (int)src_x;
        int y0 = (int)src_y;
        float fx = src_x - x0;
        float fy = src_y - y0;

        int idx00 = (y0 * width + x0) * 3;
        int idx10 = (y0 * width + x0 + 1) * 3;
        int idx01 = ((y0 + 1) * width + x0) * 3;
        int idx11 = ((y0 + 1) * width + x0 + 1) * 3;

        #define BILERP(c) \\
            (src[idx00 + c] * (1-fx) * (1-fy) + \\
             src[idx10 + c] * fx * (1-fy) + \\
             src[idx01 + c] * (1-fx) * fy + \\
             src[idx11 + c] * fx * fy)

        r = BILERP(0);
        g = BILERP(1);
        b = BILERP(2);
    }

'''

    # Add color transforms
    for i, effect in enumerate(effects):
        op = effect['op']

        if op == 'hue_shift':
            degrees = float(effect.get('degrees', 0))
            code += f'''
    // Hue shift {i}
    {{
        float shift = {degrees:.1f}f;
        float rf = r / 255.0f;
        float gf = g / 255.0f;
        float bf = b / 255.0f;

        float max_c = fmaxf(rf, fmaxf(gf, bf));
        float min_c = fminf(rf, fminf(gf, bf));
        float delta = max_c - min_c;

        float h = 0, s = 0, v = max_c;

        if (delta > 0.00001f) {{
            s = delta / max_c;
            if (rf >= max_c) h = (gf - bf) / delta;
            else if (gf >= max_c) h = 2.0f + (bf - rf) / delta;
            else h = 4.0f + (rf - gf) / delta;
            h *= 60.0f;
            if (h < 0) h += 360.0f;
        }}

        h = fmodf(h + shift + 360.0f, 360.0f);

        float c = v * s;
        float x_val = c * (1 - fabsf(fmodf(h / 60.0f, 2.0f) - 1));
        float m = v - c;

        float r2, g2, b2;
        if (h < 60) {{ r2 = c; g2 = x_val; b2 = 0; }}
        else if (h < 120) {{ r2 = x_val; g2 = c; b2 = 0; }}
        else if (h < 180) {{ r2 = 0; g2 = c; b2 = x_val; }}
        else if (h < 240) {{ r2 = 0; g2 = x_val; b2 = c; }}
        else if (h < 300) {{ r2 = x_val; g2 = 0; b2 = c; }}
        else {{ r2 = c; g2 = 0; b2 = x_val; }}

        r = (r2 + m) * 255.0f;
        g = (g2 + m) * 255.0f;
        b = (b2 + m) * 255.0f;
    }}
'''
        elif op == 'brightness':
            code += '''
    // Brightness
    {
        r *= brightness_factor;
        g *= brightness_factor;
        b *= brightness_factor;
    }
'''

    # Write output
    code += '''
    // Write output
    int dst_idx = (y * width + x) * 3;
    dst[dst_idx] = (unsigned char)fminf(255.0f, fmaxf(0.0f, r));
    dst[dst_idx + 1] = (unsigned char)fminf(255.0f, fmaxf(0.0f, g));
    dst[dst_idx + 2] = (unsigned char)fminf(255.0f, fmaxf(0.0f, b));
}
'''

    return code


# Test the compiler: a small benchmark comparing the fused pipeline (parameters
# computed in Python each frame) with the autonomous pipeline (parameters
# computed inside the kernel). Requires a CUDA device.
if __name__ == '__main__':
    import math
    import time

    print("[sexp_to_cuda] Testing fused kernel compiler...")
    print("=" * 60)

    # Define a test pipeline
    effects = [
        {'op': 'rotate', 'angle': 45.0},
        {'op': 'hue_shift', 'degrees': 30.0},
        {'op': 'ripple', 'amplitude': 15, 'frequency': 10, 'decay': 2, 'phase': 0, 'center_x': 960, 'center_y': 540},
        {'op': 'brightness', 'factor': 1.0},
    ]

    # The upper bound of randint is exclusive, so 256 is needed to cover the
    # full 0..255 byte range.
    frame = cp.random.randint(0, 256, (1080, 1920, 3), dtype=cp.uint8)

    # ===== Test 1: Standard fused kernel (params passed from Python) =====
    print("\n[Test 1] Standard fused kernel (Python computes params)")
    pipeline = compile_frame_pipeline(effects, 1920, 1080)

    # Warmup (triggers kernel compilation so it is not part of the timing)
    output = pipeline(frame)
    cp.cuda.Stream.null.synchronize()

    # Benchmark with Python param computation
    start = time.perf_counter()
    for i in range(100):
        # Simulate Python computing params (like sexp interpreter does)
        t = i / 30.0
        angle = t * 30
        phase = t * 2
        # Computed only to model the interpreter's per-frame cost; the fused
        # pipeline has no dynamic brightness override to pass it to.
        brightness = 0.8 + 0.4 * math.sin(t * 2)
        output = pipeline(frame, rotate_angle=angle, ripple_phase=phase)
    cp.cuda.Stream.null.synchronize()
    elapsed = time.perf_counter() - start

    print(f"  Time: {elapsed/100*1000:.2f}ms per frame")
    print(f"  FPS:  {100/elapsed:.0f}")

    # ===== Test 2: Autonomous kernel (GPU computes everything) =====
    print("\n[Test 2] Autonomous kernel (GPU computes ALL params)")

    dynamic_expressions = {
        'rotate_angle': 't * 30.0f',
        'ripple_phase': 't * 2.0f',
        'brightness_factor': '0.8f + 0.4f * sinf(t * 2.0f)',
    }

    auto_pipeline = compile_autonomous_pipeline(effects, 1920, 1080, dynamic_expressions)

    # Warmup
    output = auto_pipeline(frame, 0, 30.0)
    cp.cuda.Stream.null.synchronize()

    # Benchmark - NO Python computation in loop!
    start = time.perf_counter()
    for i in range(100):
        output = auto_pipeline(frame, i, 30.0)  # Just pass frame_num!
    cp.cuda.Stream.null.synchronize()
    elapsed = time.perf_counter() - start

    print(f"  Time: {elapsed/100*1000:.2f}ms per frame")
    print(f"  FPS:  {100/elapsed:.0f}")

    print("\n" + "=" * 60)
    print("Autonomous kernel eliminates Python from hot path!")