"""
Trainable Quantum Kernel — Kernel-Target Alignment Optimization
================================================================
Extends quantum kernels with **learnable parameters** that are optimized
via gradient descent to maximize kernel-target alignment (KTA) — a
differentiable proxy for classification accuracy.

Standard quantum kernels use a fixed feature map:

    K(x, y) = |⟨φ(x)|φ(y)⟩|²

The trainable kernel introduces parameters θ into the feature map:

    K_θ(x, y) = |⟨φ_θ(x)|φ_θ(y)⟩|²

and optimizes θ to maximize how well the kernel matrix K aligns with
the label structure yy^T of a given classification task.

Training objective — Kernel-Target Alignment (KTA):

    KTA(K, y) = ⟨K, yy^T⟩_F / (‖K‖_F · ‖yy^T‖_F)

where ⟨·,·⟩_F is the Frobenius inner product. KTA ∈ [-1, 1]:
  KTA ≈ 1  → kernel perfectly separates the classes
  KTA ≈ 0  → kernel is uncorrelated with the task
  KTA < 0  → kernel anti-correlates with the task

The gradient ∂KTA/∂θ is computed via PennyLane's automatic
differentiation through the quantum circuit, enabling standard
gradient descent on the kernel parameters.

Category: Machine Learning
Difficulty: Advanced
Framework: PennyLane
Qubits: 4
Depth: ~12
Gates: RY, RZ, CNOT

Feature map (per repetition):
    Data layer:     RY(x_i · π) on each qubit (angle encoding)
    Trainable layer: RZ(θ_k) RY(θ_{k+1}) on each qubit
    Entangling layer: CNOT chain (linear connectivity)

References:
    - Hubregtsen, T. et al. (2022). "Training quantum embedding kernels
      on near-term quantum devices." Physical Review A 106, 042431.
      DOI: 10.1103/PhysRevA.106.042431
    - Glick, J.R. et al. (2024). "Covariant quantum kernels for data
      with group structure." Nature Physics 20, 1027-1036.
      DOI: 10.1038/s41567-023-02288-w
    - Cristianini, N. et al. (2001). "On Kernel-Target Alignment."
      NeurIPS 14.
"""

import numpy as np

try:
    import pennylane as qml
    from pennylane import numpy as pnp

    PENNYLANE_AVAILABLE = True
except ImportError:
    PENNYLANE_AVAILABLE = False


# ==========================================================================
# Constants
# ==========================================================================

# Default number of wires for the feature map and kernel circuit.
DEFAULT_N_QUBITS = 4
# Default number of (data, trainable, entangling) layer repetitions.
DEFAULT_N_REPS = 2
# Default size and dimensionality of the synthetic data set in run_circuit.
DEFAULT_N_SAMPLES = 8
DEFAULT_N_FEATURES = 2
# Default number of gradient-descent steps in train_kernel
# (run_circuit uses its own, smaller default).
DEFAULT_N_ITERATIONS = 50
# Default step size handed to the gradient-descent optimizer.
DEFAULT_LEARNING_RATE = 0.1
# Seed for the NumPy generators that build the synthetic data set and the
# random baseline parameters.
SEED = 42

# Data generation parameters
# Feature values are multiplied by pi before being used as RY angles, so
# these values are expressed in units of pi radians.
CLASS_SPREAD = 0.3        # Standard deviation for each class cluster
CLASS_0_CENTER = 0.3      # Center of class -1 in feature space
CLASS_1_CENTER = 0.7      # Center of class +1 in feature space


# ==========================================================================
# Feature map
# ==========================================================================

def create_trainable_feature_map(n_qubits: int = DEFAULT_N_QUBITS):
    """Build the parameterized embedding used by the trainable kernel.

    Each repetition of the returned circuit consists of three layers:

    1. **Data encoding**: RY(x_i * pi) on every qubit. Features are reused
       cyclically when there are more qubits than features; when there are
       more features than qubits only the first ``n_qubits`` are encoded.
    2. **Trainable rotations**: RZ followed by RY on every qubit, each with
       its own entry of ``params``.
    3. **Entanglement**: a CNOT between every pair of neighbouring qubits
       (0→1, 1→2, ...).

    The circuit therefore consumes ``reps * n_qubits * 2`` parameters.

    Args:
        n_qubits: Number of qubits the feature map acts on.

    Returns:
        A function ``feature_map(x, params, reps)`` that queues the gates in
        the active PennyLane context (it returns nothing itself).

    Raises:
        ImportError: If PennyLane is not installed.
    """
    if not PENNYLANE_AVAILABLE:
        raise ImportError("PennyLane required: pip install pennylane")

    wires = range(n_qubits)

    def feature_map(x, params, reps=DEFAULT_N_REPS):
        """Queue the trainable feature map for one data point.

        Args:
            x: Input data vector of shape (n_features,).
            params: Flat sequence of trainable angles laid out as
                    [rep][qubit][RZ, RY]; length = reps * n_qubits * 2.
            reps: Number of layer repetitions.
        """
        n_features = len(x)

        for rep in range(reps):
            # Data layer: one RY per qubit, cycling through the features.
            for wire in wires:
                qml.RY(x[wire % n_features] * np.pi, wires=wire)

            # Trainable layer: this repetition owns a contiguous slice of
            # 2 * n_qubits parameters, two consecutive entries per qubit.
            offset = 2 * n_qubits * rep
            for wire in wires:
                qml.RZ(params[offset + 2 * wire], wires=wire)
                qml.RY(params[offset + 2 * wire + 1], wires=wire)

            # Entangling layer: nearest-neighbour CNOT chain.
            for control, target in zip(wires[:-1], wires[1:]):
                qml.CNOT(wires=[control, target])

    return feature_map


# ==========================================================================
# Trainable kernel
# ==========================================================================

def create_trainable_kernel(
    n_qubits: int = DEFAULT_N_QUBITS,
    n_reps: int = DEFAULT_N_REPS,
):
    """Build the fidelity kernel K_θ(x, y) via the inversion test.

    The circuit applies the feature map for ``x`` followed by the adjoint
    feature map for ``y`` and reads out the probability of the all-zeros
    basis state:

        K_θ(x, y) = |⟨0|φ_θ†(y) φ_θ(x)|0⟩|²

    For x == y the adjoint undoes the forward map, so K_θ(x, x) = 1.

    NOTE: the trainable and entangling layers of the *last* repetition do
    not depend on the data, so in φ_θ†(y) φ_θ(x) they sit back-to-back with
    their own adjoints and cancel. Only the parameters of the earlier
    repetitions influence the kernel value.

    Args:
        n_qubits: Number of qubits in the kernel circuit.
        n_reps: Number of feature map repetitions.

    Returns:
        A function ``kernel(x, y, params)`` returning the kernel value for
        data points x and y under the given feature-map parameters.

    Raises:
        ImportError: If PennyLane is not installed.
    """
    if not PENNYLANE_AVAILABLE:
        raise ImportError("PennyLane required: pip install pennylane")

    embed = create_trainable_feature_map(n_qubits)
    device = qml.device("default.qubit", wires=n_qubits)

    def overlap_circuit(x, y, params):
        # φ_θ(x) followed by φ_θ†(y); qml.adjoint reverses the gate order
        # and inverts every gate of the wrapped feature map.
        embed(x, params, n_reps)
        qml.adjoint(embed)(y, params, n_reps)
        return qml.probs(wires=range(n_qubits))

    overlap_probs = qml.QNode(overlap_circuit, device, interface="autograd")

    def kernel(x, y, params):
        """Evaluate K_θ(x, y).

        Args:
            x: First data point, shape (n_features,).
            y: Second data point, shape (n_features,).
            params: Trainable parameters for the feature map.

        Returns:
            Kernel value in [0, 1]; equal to 1 when x == y.
        """
        # Entry 0 of the probability vector is the all-zeros outcome.
        return overlap_probs(x, y, params)[0]

    return kernel


# ==========================================================================
# Kernel-target alignment
# ==========================================================================

def kernel_target_alignment(K, y):
    """Compute kernel-target alignment (KTA).

    KTA is the cosine similarity, in Frobenius space, between a kernel
    matrix K and the ideal kernel yy^T of a binary classification task:

        KTA(K, y) = ⟨K, yy^T⟩_F / (‖K‖_F · ‖yy^T‖_F)

    where ⟨A, B⟩_F = Σ_ij A_ij B_ij.

    When PennyLane is installed, every operation goes through its
    autograd-aware NumPy wrapper so the result stays differentiable with
    respect to the kernel parameters. Without PennyLane the same formula is
    evaluated with plain NumPy, so the function remains usable for
    inspecting precomputed kernel matrices.

    Args:
        K: Kernel matrix of shape (n, n).
        y: Label vector of shape (n,) with values in {-1, +1}.

    Returns:
        KTA value in [-1, 1]. Higher is better. A kernel matrix with zero
        Frobenius norm is not guarded against and yields a 0/0 division.
    """
    # `pnp` only exists when the PennyLane import succeeded, so pick the
    # array backend once instead of mixing pnp and np calls.
    xp = pnp if PENNYLANE_AVAILABLE else np

    # Ideal kernel: same-class pairs → +1, different-class pairs → -1.
    yy = xp.outer(y, y)

    # Frobenius norms written as sqrt(sum(A * A)) rather than
    # np.linalg.norm so autograd can differentiate through them.
    k_fro = xp.sqrt(xp.sum(K * K))
    yy_fro = xp.sqrt(xp.sum(yy * yy))

    # Frobenius inner product divided by the product of the norms.
    return xp.sum(K * yy) / (k_fro * yy_fro)


# ==========================================================================
# Training loop
# ==========================================================================

def train_kernel(
    X: np.ndarray,
    y: np.ndarray,
    n_qubits: int = DEFAULT_N_QUBITS,
    n_reps: int = DEFAULT_N_REPS,
    n_iterations: int = DEFAULT_N_ITERATIONS,
    learning_rate: float = DEFAULT_LEARNING_RATE,
    seed: "int | None" = SEED,
) -> dict:
    """Train kernel parameters to maximize kernel-target alignment.

    The training loop:
      1. Initialize parameters θ ~ Uniform(0, 2π) from a seeded generator
      2. For each iteration:
         a. Compute kernel matrix K_θ(x_i, x_j) for all pairs
         b. Evaluate KTA(K_θ, y)
         c. Compute ∂(-KTA)/∂θ via PennyLane autograd
         d. Update θ ← θ - η · ∂(-KTA)/∂θ
      3. Evaluate KTA once more at the final θ

    The negative KTA is minimized (PennyLane optimizers minimize).

    Args:
        X: Training data of shape (n_samples, n_features).
        y: Binary labels of shape (n_samples,), values in {-1, +1}.
        n_qubits: Number of qubits in the feature map.
        n_reps: Number of feature map repetitions.
        n_iterations: Number of gradient descent steps.
        learning_rate: Step size for the optimizer.
        seed: Seed for the parameter initialization. Pass ``None`` to draw
            a different initialization on every call.

    Returns:
        dict with keys:
            optimal_params: List of optimized parameter values.
            final_kta: KTA evaluated at ``optimal_params`` (float).
            kta_history: One float per iteration: the KTA of the
                parameters *before* that iteration's update.
            n_iterations: Number of iterations performed.

    Raises:
        ImportError: If PennyLane is not installed.
        ValueError: If X and y have different lengths.
    """
    if not PENNYLANE_AVAILABLE:
        raise ImportError("PennyLane required: pip install pennylane")

    n_samples = len(X)
    if n_samples != len(y):
        raise ValueError(
            f"X and y must have the same length, got {n_samples} and {len(y)}"
        )

    kernel_fn = create_trainable_kernel(n_qubits, n_reps)

    # Two angles (RZ, RY) per qubit per repetition, drawn from [0, 2π).
    # A dedicated generator keeps runs reproducible without touching the
    # global NumPy random state.
    n_params = n_reps * n_qubits * 2
    rng = np.random.default_rng(seed)
    params = pnp.array(rng.uniform(0, 2 * np.pi, n_params), requires_grad=True)

    optimizer = qml.GradientDescentOptimizer(learning_rate)

    def negative_kta(theta):
        """Cost function: negative KTA (for minimization).

        The kernel matrix is assembled with pnp.stack rather than by item
        assignment so the autograd-traced entries stay differentiable.
        """
        rows = [
            pnp.stack([kernel_fn(X[i], X[j], theta) for j in range(n_samples)])
            for i in range(n_samples)
        ]
        return -kernel_target_alignment(pnp.stack(rows), y)

    kta_history = []
    for _ in range(n_iterations):
        # step_and_cost reports the cost at the parameters *before* the
        # update, so the history lags the returned parameters by one step.
        params, cost = optimizer.step_and_cost(negative_kta, params)
        kta_history.append(-float(cost))

    # Evaluate at the final parameters so final_kta really describes
    # optimal_params; this also covers n_iterations == 0.
    final_kta = -float(negative_kta(params))

    return {
        "optimal_params": params.tolist(),
        "final_kta": final_kta,
        "kta_history": kta_history,
        "n_iterations": n_iterations,
    }


# ==========================================================================
# Execution
# ==========================================================================

def _kernel_matrix(kernel_fn, X, params):
    """Evaluate kernel_fn on every ordered pair of rows of X as a float matrix."""
    n_points = len(X)
    K = np.zeros((n_points, n_points))
    for i, x_i in enumerate(X):
        for j, x_j in enumerate(X):
            K[i, j] = kernel_fn(x_i, x_j, params)
    return K


def run_circuit(
    n_samples: int = DEFAULT_N_SAMPLES,
    n_features: int = DEFAULT_N_FEATURES,
    n_qubits: int = DEFAULT_N_QUBITS,
    n_iterations: int = 5,
) -> dict:
    """Run trainable kernel demonstration.

    Generates synthetic binary classification data (two Gaussian clusters),
    trains the kernel parameters via KTA optimization, and compares the
    trained kernel against a kernel with random (untrained) parameters.

    Args:
        n_samples: Total number of data points. Class -1 receives
            ``n_samples // 2`` points and class +1 the remainder.
        n_features: Dimensionality of each data point.
        n_qubits: Number of qubits in the kernel circuit.
        n_iterations: Number of KTA optimization steps.

    Returns:
        If PennyLane is missing, a dict with keys ``error`` and ``install``.
        Otherwise a dict with keys:
            n_qubits, n_samples, n_iterations: Configuration.
            random_kta: KTA with random (untrained) parameters (float).
            trained_kta: KTA after optimization (float).
            improvement: trained_kta - random_kta.
            optimal_params: List of optimized parameter values.
            framework: "PennyLane".

    Raises:
        ValueError: If n_samples is smaller than 2.
    """
    if not PENNYLANE_AVAILABLE:
        return {
            "error": "PennyLane not installed",
            "install": "pip install pennylane",
        }
    if n_samples < 2:
        raise ValueError("n_samples must be at least 2 (one point per class)")

    rng = np.random.default_rng(SEED)

    # Two Gaussian clusters. The second class takes the remainder so that
    # X and y both contain exactly n_samples entries for odd n_samples too.
    n_class0 = n_samples // 2
    n_class1 = n_samples - n_class0
    X0 = rng.standard_normal((n_class0, n_features)) * CLASS_SPREAD + CLASS_0_CENTER
    X1 = rng.standard_normal((n_class1, n_features)) * CLASS_SPREAD + CLASS_1_CENTER
    X = np.vstack([X0, X1])
    y = np.array([-1] * n_class0 + [1] * n_class1)

    # Shuffle to avoid ordering bias
    idx = rng.permutation(len(X))
    X, y = X[idx], y[idx]

    # Train kernel parameters via KTA maximization. train_kernel runs with
    # its default number of repetitions, which matches DEFAULT_N_REPS used
    # for the evaluation kernel below.
    result = train_kernel(X, y, n_qubits, n_iterations=n_iterations)

    # Baseline: same circuit with freshly drawn, untrained parameters.
    n_params = DEFAULT_N_REPS * n_qubits * 2
    random_params = rng.uniform(0, 2 * np.pi, n_params)
    kernel_fn = create_trainable_kernel(n_qubits, DEFAULT_N_REPS)

    # Convert the optimized parameters once, not once per matrix entry.
    trained_params = np.array(result["optimal_params"])

    K_random = _kernel_matrix(kernel_fn, X, random_params)
    K_trained = _kernel_matrix(kernel_fn, X, trained_params)

    # Plain floats keep the result printable and serializable regardless
    # of the array type the alignment is computed with.
    random_kta = float(kernel_target_alignment(K_random, y))
    trained_kta = float(kernel_target_alignment(K_trained, y))

    return {
        "n_qubits": n_qubits,
        "n_samples": n_samples,
        "n_iterations": n_iterations,
        "random_kta": random_kta,
        "trained_kta": trained_kta,
        "improvement": trained_kta - random_kta,
        "optimal_params": result["optimal_params"],
        "framework": "PennyLane",
    }


# ==========================================================================
# Verification
# ==========================================================================

def verify_trainable_kernel(n_samples: int = 8, n_iterations: int = 15) -> dict:
    """Run the demo and sanity-check its results.

    Checks:
      1. ``kta_improvement``: trained KTA is at least the random-baseline
         KTA minus a tolerance of 0.02.
      2. ``kta_valid_range``: both KTA values lie in [-1, 1].
      3. ``kernel_symmetric``: the trained kernel matrix equals its
         transpose (absolute tolerance 1e-6).
      4. ``unit_diagonal``: the mean of the kernel diagonal is within 0.05
         of 1.0.

    Checks 3 and 4 rebuild a data set with the same seed and default
    feature count, and evaluate a kernel with the default qubit and
    repetition counts at the optimized parameters.

    Args:
        n_samples: Number of test data points.
        n_iterations: Training iterations for the test.

    Returns:
        dict with:
            passed: bool — True if all checks pass.
            checks: list of dicts with name, passed, detail.
        Without PennyLane, a single failed ``import`` check is returned.
    """
    if not PENNYLANE_AVAILABLE:
        return {
            "passed": False,
            "checks": [
                {"name": "import", "passed": False,
                 "detail": "PennyLane not installed"}
            ],
        }

    outcome = run_circuit(n_samples=n_samples, n_iterations=n_iterations)
    trained = float(outcome["trained_kta"])
    random_kta = float(outcome["random_kta"])

    checks = [
        # Check 1: training should not make the alignment noticeably worse
        # than the random baseline.
        {
            "name": "kta_improvement",
            "passed": trained >= random_kta - 0.02,
            "detail": (
                f"Trained KTA ({trained:.4f}) "
                f">= Random KTA ({random_kta:.4f}) - 0.02"
            ),
        },
        # Check 2: a cosine similarity must stay within [-1, 1].
        {
            "name": "kta_valid_range",
            "passed": -1.0 <= trained <= 1.0 and -1.0 <= random_kta <= 1.0,
            "detail": (
                f"Trained KTA = {trained:.4f}, "
                f"Random KTA = {random_kta:.4f} "
                f"(expected [-1, 1])"
            ),
        },
    ]

    # Rebuild a data set (same seed, same draw order: class 0, class 1,
    # permutation) to inspect the trained kernel matrix itself.
    rng = np.random.default_rng(SEED)
    cluster_shape = (n_samples // 2, DEFAULT_N_FEATURES)
    cluster0 = rng.standard_normal(cluster_shape) * CLASS_SPREAD + CLASS_0_CENTER
    cluster1 = rng.standard_normal(cluster_shape) * CLASS_SPREAD + CLASS_1_CENTER
    points = np.vstack([cluster0, cluster1])
    points = points[rng.permutation(len(points))]

    kernel_fn = create_trainable_kernel(DEFAULT_N_QUBITS, DEFAULT_N_REPS)
    theta = np.array(outcome["optimal_params"])

    gram = np.zeros((len(points), len(points)))
    for row, a in enumerate(points):
        for col, b in enumerate(points):
            gram[row, col] = kernel_fn(a, b, theta)

    # Check 3: K(x, y) must equal K(y, x).
    is_symmetric = bool(np.allclose(gram, gram.T, atol=1e-6))
    checks.append({
        "name": "kernel_symmetric",
        "passed": is_symmetric,
        "detail": f"K = K^T: {is_symmetric}",
    })

    # Check 4: self-similarity K(x, x) should be 1.
    diag_mean = float(np.mean(np.diag(gram)))
    checks.append({
        "name": "unit_diagonal",
        "passed": abs(diag_mean - 1.0) < 0.05,
        "detail": f"Diagonal mean = {diag_mean:.4f} (expected ~1.0)",
    })

    return {
        "passed": all(check["passed"] for check in checks),
        "checks": checks,
    }


# ==========================================================================
# Main
# ==========================================================================

def create_circuit():
    """Zero-arg entry point for the QubitHub PennyLane runner.

    The trainable kernel's inner QNode takes ``(x, y, params)``. The
    QubitHub runner contract calls a 2-arg ``kernel(x, y)``, so this
    wrapper closes over deterministic pseudo-random params (drawn with the
    module-level ``SEED``) and exposes the data-only callable.

    Returns:
        A function ``kernel(x, y)`` evaluating the kernel with the default
        qubit and repetition counts at the fixed parameters.

    Raises:
        ImportError: If PennyLane is not installed.
    """
    if not PENNYLANE_AVAILABLE:
        raise ImportError("PennyLane required: pip install pennylane")
    inner = create_trainable_kernel(n_qubits=DEFAULT_N_QUBITS, n_reps=DEFAULT_N_REPS)
    rng = np.random.default_rng(SEED)
    # The feature map reads two angles (RZ, RY) per qubit per repetition;
    # any entries beyond reps * n_qubits * 2 would never be used.
    n_params = DEFAULT_N_REPS * DEFAULT_N_QUBITS * 2
    params = rng.uniform(0.0, 2 * np.pi, size=n_params).tolist()

    def kernel(x, y):
        return inner(x, y, params)

    return kernel


if __name__ == "__main__":
    # Script entry point: run the demo, print a summary, then print the
    # outcome of every verification check.
    print("Trainable Quantum Kernel")
    print("=" * 50)

    demo = run_circuit(n_samples=12, n_iterations=15)

    if "error" not in demo:
        summary_lines = (
            f"\nFramework: {demo['framework']}",
            f"Qubits: {demo['n_qubits']}",
            f"Iterations: {demo['n_iterations']}",
            f"Random KTA: {demo['random_kta']:.4f}",
            f"Trained KTA: {demo['trained_kta']:.4f}",
            f"Improvement: {demo['improvement']:.4f}",
        )
        for line in summary_lines:
            print(line)
    else:
        # run_circuit reports a missing dependency through an "error" key.
        print(f"Error: {demo['error']}")

    print("\nVerification:")
    print("-" * 50)
    report = verify_trainable_kernel()
    for check in report["checks"]:
        status = "PASS" if check["passed"] else "FAIL"
        print(f"  [{status}] {check['name']}: {check['detail']}")
    verdict = "PASSED" if report["passed"] else "FAILED"
    print(f"\nOverall: {verdict}")