#!/usr/bin/env python3
"""
Compare multiple analysis CSV files to determine which most closely
resembles a reference file.

Uses Euclidean distance, correlation, and RMSE metrics.
"""

import sys
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.stats import pearsonr


def calculate_euclidean_distance(df1, df2, numeric_cols):
    """Calculate the per-row Euclidean distance between two dataframes."""
    distances = []
    for idx in df1.index:
        diff = df1.loc[idx, numeric_cols] - df2.loc[idx, numeric_cols]
        # Use nansum to ignore NaN values. Caveat: a row with no overlapping
        # valid values therefore gets a distance of 0, not NaN.
        distance = np.sqrt(np.nansum(diff ** 2))
        distances.append(distance)
    return np.array(distances)


def calculate_rmse(df1, df2, numeric_cols):
    """Calculate Root Mean Squared Error across all numeric cells."""
    diff = df1[numeric_cols] - df2[numeric_cols]
    # Use nanmean to ignore NaN values
    mse = np.nanmean(diff.values ** 2)
    return np.sqrt(mse)


def calculate_correlation(df1, df2, numeric_cols):
    """Calculate Pearson correlation across all numeric values."""
    vals1 = df1[numeric_cols].values.flatten()
    vals2 = df2[numeric_cols].values.flatten()
    # Remove NaN values - only use positions where both have valid values
    mask = ~(np.isnan(vals1) | np.isnan(vals2))
    vals1_clean = vals1[mask]
    vals2_clean = vals2[mask]
    if len(vals1_clean) < 2:
        return np.nan, np.nan
    corr, pvalue = pearsonr(vals1_clean, vals2_clean)
    return corr, pvalue


def compare_analyses(reference_file, comparison_files):
    """Compare multiple analysis files to a reference file."""
    # Read reference file
    print(f"Reading reference file: {reference_file}")
    ref_df = pd.read_csv(reference_file, quotechar='"', escapechar='\\',
                         engine='python')

    # Get numeric columns (all the rating dimensions)
    numeric_cols = [col for col in ref_df.columns
                    if col.startswith(('Design_', 'Entanglement_', 'Experience_'))]

    # Convert numeric columns to numeric type, coercing errors to NaN
    for col in numeric_cols:
        ref_df[col] = pd.to_numeric(ref_df[col], errors='coerce')

    print(f"\nFound {len(numeric_cols)} numeric dimensions to compare")
    print(f"Comparing {len(ref_df)} protocols\n")
    print("="*80)

    results = {}

    for comp_file in comparison_files:
        print(f"\nComparing: {Path(comp_file).name}")
        print("-"*80)

        # Read comparison file
        comp_df = pd.read_csv(comp_file, quotechar='"', escapechar='\\',
                              engine='python')

        # Convert numeric columns to numeric type, coercing errors to NaN
        for col in numeric_cols:
            comp_df[col] = pd.to_numeric(comp_df[col], errors='coerce')

        # Ensure same protocols in same order (match by Descriptor)
        if 'Descriptor' in ref_df.columns and 'Descriptor' in comp_df.columns:
            # Use merge to ensure exact matching - only keep protocols in ref_df
            comp_df = pd.merge(
                ref_df[['Descriptor']], comp_df,
                on='Descriptor', how='left'
            )

        # Calculate Euclidean distances using reset indices to ensure alignment
        ref_temp = ref_df.reset_index(drop=True)
        comp_temp = comp_df.reset_index(drop=True)
        euclidean_distances = calculate_euclidean_distance(
            ref_temp, comp_temp, numeric_cols)
        total_euclidean = np.sum(euclidean_distances)
        avg_euclidean = np.mean(euclidean_distances)

        # Calculate RMSE
        rmse = calculate_rmse(ref_temp, comp_temp, numeric_cols)

        # Calculate correlation
        correlation, p_value = calculate_correlation(ref_temp, comp_temp,
                                                     numeric_cols)

        # Store results
        results[Path(comp_file).name] = {
            'total_euclidean': total_euclidean,
            'avg_euclidean': avg_euclidean,
            'rmse': rmse,
            'correlation': correlation,
            'p_value': p_value,
            'per_protocol_distances': euclidean_distances,
            'protocols': (ref_df['Descriptor'].values
                          if 'Descriptor' in ref_df.columns else None),
        }

        # Print results
        print(f"  Total Euclidean Distance: {total_euclidean:.2f}")
        print(f"  Average Euclidean Distance: {avg_euclidean:.2f}")
        print(f"  RMSE: {rmse:.2f}")
        print(f"  Pearson Correlation: {correlation:.4f} (p={p_value:.2e})")

    # Summary comparison
    print("\n" + "="*80)
    print("SUMMARY RANKING (lower distance = more similar)")
    print("="*80)

    # Sort by average Euclidean distance
    sorted_by_euclidean = sorted(results.items(),
                                 key=lambda x: x[1]['avg_euclidean'])
    print("\nBy Average Euclidean Distance:")
    for i, (name, data) in enumerate(sorted_by_euclidean, 1):
        print(f"  {i}. {name:30s} - Avg Distance: {data['avg_euclidean']:.2f}")

    # Sort by correlation (higher is better)
    sorted_by_corr = sorted(results.items(),
                            key=lambda x: x[1]['correlation'], reverse=True)
    print("\nBy Correlation (higher = more similar):")
    for i, (name, data) in enumerate(sorted_by_corr, 1):
        print(f"  {i}. {name:30s} - Correlation: {data['correlation']:.4f}")

    # Sort by RMSE
    sorted_by_rmse = sorted(results.items(), key=lambda x: x[1]['rmse'])
    print("\nBy RMSE (lower = more similar):")
    for i, (name, data) in enumerate(sorted_by_rmse, 1):
        print(f"  {i}. {name:30s} - RMSE: {data['rmse']:.2f}")

    # Show protocols with largest differences for the best match
    print("\n" + "="*80)
    best_match_name, best_match_data = sorted_by_euclidean[0]
    print(f"Top 10 protocols with largest differences from {best_match_name}:")
    print("="*80)
    if best_match_data['protocols'] is not None:
        distances = best_match_data['per_protocol_distances']
        protocols = best_match_data['protocols']
        # Indices of the 10 largest per-protocol distances, descending
        top_diff_indices = np.argsort(distances)[-10:][::-1]
        for idx in top_diff_indices:
            print(f"  {protocols[idx]:50s} - Distance: {distances[idx]:.2f}")

    return results


if __name__ == "__main__":
    # Define file paths
    reference_file = "analysis_output_manual.csv"
    comparison_files = [
        "analysis_output_gemma3-12b.csv",
        "analysis_output_gpt-oss.csv",
        "analysis_output_mistral.csv",
    ]

    # Check if files exist
    if not Path(reference_file).exists():
        print(f"Error: Reference file '{reference_file}' not found")
        sys.exit(1)

    # Build a filtered list rather than calling remove() while iterating,
    # which would silently skip the element after each removal.
    existing_files = []
    for file in comparison_files:
        if Path(file).exists():
            existing_files.append(file)
        else:
            print(f"Warning: Comparison file '{file}' not found, skipping...")
    comparison_files = existing_files

    if not comparison_files:
        print("Error: No comparison files found")
        sys.exit(1)

    # Run comparison
    results = compare_analyses(reference_file, comparison_files)

    print("\n" + "="*80)
    print("Analysis complete!")
    print("="*80)