Initial analysis complete

analysis/compare_analyses.py (new file, 186 lines)
@@ -0,0 +1,186 @@
#!/usr/bin/env python3
"""
Compare multiple analysis CSV files to determine which most closely resembles a reference file.
Uses Euclidean distance, correlation, and RMSE metrics.
"""

import sys

import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from pathlib import Path


def calculate_euclidean_distance(df1, df2, numeric_cols):
    """Calculate per-row Euclidean distance between two dataframes.

    Assumes df1 and df2 share the same index; callers reset the index first.
    """
    distances = []
    for idx in df1.index:
        diff = df1.loc[idx, numeric_cols] - df2.loc[idx, numeric_cols]
        # Use nansum to ignore NaN values
        distance = np.sqrt(np.nansum(diff ** 2))
        distances.append(distance)
    return np.array(distances)
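

# Minimal sanity sketch for the helper above, using made-up two-row frames and
# one hypothetical dimension column; nothing in the pipeline calls this.
def _demo_euclidean_distance():
    a = pd.DataFrame({'Design_Example': [1.0, 2.0]})
    b = pd.DataFrame({'Design_Example': [1.0, 4.0]})
    # Expected per-row distances: [0.0, 2.0]
    return calculate_euclidean_distance(a, b, ['Design_Example'])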


def calculate_rmse(df1, df2, numeric_cols):
    """Calculate Root Mean Squared Error."""
    diff = df1[numeric_cols] - df2[numeric_cols]
    # Use nanmean to ignore NaN values
    mse = np.nanmean(diff.values ** 2)
    return np.sqrt(mse)
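
# Worked example (hypothetical numbers): element-wise differences of
# [0.0, 2.0] give MSE (0 + 4) / 2 = 2.0 and RMSE sqrt(2) ~= 1.41. Note that
# np.nanmean divides by the count of non-NaN cells only.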


def calculate_correlation(df1, df2, numeric_cols):
    """Calculate Pearson correlation across all numeric values."""
    vals1 = df1[numeric_cols].values.flatten()
    vals2 = df2[numeric_cols].values.flatten()

    # Remove NaN values - only use positions where both have valid values
    mask = ~(np.isnan(vals1) | np.isnan(vals2))
    vals1_clean = vals1[mask]
    vals2_clean = vals2[mask]

    if len(vals1_clean) < 2:
        return np.nan, np.nan

    corr, pvalue = pearsonr(vals1_clean, vals2_clean)
    return corr, pvalue
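
# Sketch of the masking behaviour (made-up vectors): comparing
# [1, 2, NaN, 4] with [2, 4, 6, 8] drops the third position; the remaining
# three pairs are perfectly linear, so the correlation is 1.0.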


def compare_analyses(reference_file, comparison_files):
    """Compare multiple analysis files to a reference file."""

    # Read reference file
    print(f"Reading reference file: {reference_file}")
    ref_df = pd.read_csv(reference_file, quotechar='"', escapechar='\\', engine='python')

    # Get numeric columns (all the rating dimensions)
    numeric_cols = [col for col in ref_df.columns if
                    col.startswith(('Design_', 'Entanglement_', 'Experience_'))]

    # Coerce the rating columns to numeric, turning unparseable entries into NaN
    for col in numeric_cols:
        ref_df[col] = pd.to_numeric(ref_df[col], errors='coerce')

    print(f"\nFound {len(numeric_cols)} numeric dimensions to compare")
    print(f"Comparing {len(ref_df)} protocols\n")
    print("="*80)

    results = {}

    for comp_file in comparison_files:
        print(f"\nComparing: {Path(comp_file).name}")
        print("-"*80)

        # Read comparison file
        comp_df = pd.read_csv(comp_file, quotechar='"', escapechar='\\', engine='python')

        # Coerce the rating columns to numeric, turning unparseable entries into NaN
        for col in numeric_cols:
            comp_df[col] = pd.to_numeric(comp_df[col], errors='coerce')

        # Ensure same protocols in same order (match by Descriptor)
        if 'Descriptor' in ref_df.columns and 'Descriptor' in comp_df.columns:
            # Use merge to ensure exact matching - keep only protocols in ref_df
            comp_df = pd.merge(
                ref_df[['Descriptor']],
                comp_df,
                on='Descriptor',
                how='left'
            )
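            # After the left merge, rows follow ref_df's Descriptor order.
            # Protocols absent from the comparison file become all-NaN rows;
            # np.nansum then scores such a row as distance 0, so a file with
            # missing protocols can look more similar than it really is.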

        # Calculate Euclidean distances using reset indices to ensure alignment
        ref_temp = ref_df.reset_index(drop=True)
        comp_temp = comp_df.reset_index(drop=True)
        euclidean_distances = calculate_euclidean_distance(ref_temp, comp_temp, numeric_cols)
        total_euclidean = np.sum(euclidean_distances)
        avg_euclidean = np.mean(euclidean_distances)

        # Calculate RMSE
        rmse = calculate_rmse(ref_temp, comp_temp, numeric_cols)

        # Calculate correlation
        correlation, p_value = calculate_correlation(ref_temp, comp_temp, numeric_cols)

        # Store results
        results[Path(comp_file).name] = {
            'total_euclidean': total_euclidean,
            'avg_euclidean': avg_euclidean,
            'rmse': rmse,
            'correlation': correlation,
            'p_value': p_value,
            'per_protocol_distances': euclidean_distances,
            'protocols': ref_df['Descriptor'].values if 'Descriptor' in ref_df.columns else None
        }

        # Print results
        print(f"  Total Euclidean Distance:   {total_euclidean:.2f}")
        print(f"  Average Euclidean Distance: {avg_euclidean:.2f}")
        print(f"  RMSE: {rmse:.2f}")
        print(f"  Pearson Correlation: {correlation:.4f} (p={p_value:.2e})")

    # Summary comparison
    print("\n" + "="*80)
    print("SUMMARY RANKING (lower distance = more similar)")
    print("="*80)

    # Sort by average Euclidean distance
    sorted_by_euclidean = sorted(results.items(), key=lambda x: x[1]['avg_euclidean'])

    print("\nBy Average Euclidean Distance:")
    for i, (name, data) in enumerate(sorted_by_euclidean, 1):
        print(f"  {i}. {name:30s} - Avg Distance: {data['avg_euclidean']:.2f}")

    # Sort by correlation (higher is better)
    sorted_by_corr = sorted(results.items(), key=lambda x: x[1]['correlation'], reverse=True)

    print("\nBy Correlation (higher = more similar):")
    for i, (name, data) in enumerate(sorted_by_corr, 1):
        print(f"  {i}. {name:30s} - Correlation: {data['correlation']:.4f}")

    # Sort by RMSE
    sorted_by_rmse = sorted(results.items(), key=lambda x: x[1]['rmse'])

    print("\nBy RMSE (lower = more similar):")
    for i, (name, data) in enumerate(sorted_by_rmse, 1):
        print(f"  {i}. {name:30s} - RMSE: {data['rmse']:.2f}")

    # Show protocols with largest differences for the best match
    print("\n" + "="*80)
    best_match_name, best_match_data = sorted_by_euclidean[0]
    print(f"Top 10 protocols with largest differences from {best_match_name}:")
    print("="*80)

    if best_match_data['protocols'] is not None:
        distances = best_match_data['per_protocol_distances']
        protocols = best_match_data['protocols']
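        # np.argsort is ascending, so the last 10 indices are the largest
        # distances; reversing lists them biggest-first.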
        top_diff_indices = np.argsort(distances)[-10:][::-1]

        for idx in top_diff_indices:
            print(f"  {protocols[idx]:50s} - Distance: {distances[idx]:.2f}")

    return results
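

# The returned dict maps each comparison file name to its metrics, so the
# function can also be used programmatically, e.g. (hypothetical file names):
#     results = compare_analyses("ref.csv", ["model_a.csv"])
#     print(results["model_a.csv"]["rmse"])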


if __name__ == "__main__":
    # Define file paths
    reference_file = "analysis_output_manual.csv"
    comparison_files = [
        "analysis_output_gemma3-12b.csv",
        "analysis_output_gpt-oss.csv",
        "analysis_output_mistral.csv"
    ]

    # Check if files exist
    if not Path(reference_file).exists():
        print(f"Error: Reference file '{reference_file}' not found")
        sys.exit(1)

    # Filter with comprehensions: removing items from a list while iterating
    # over it skips elements, so build new lists instead
    missing = [f for f in comparison_files if not Path(f).exists()]
    for file in missing:
        print(f"Warning: Comparison file '{file}' not found, skipping...")
    comparison_files = [f for f in comparison_files if Path(f).exists()]

    if not comparison_files:
        print("Error: No comparison files found")
        sys.exit(1)

    # Run comparison
    results = compare_analyses(reference_file, comparison_files)

    print("\n" + "="*80)
    print("Analysis complete!")
    print("="*80)