#!/usr/bin/env python3
"""
Compare multiple analysis CSV files to determine which most closely
resembles a reference file.

Uses Euclidean distance, correlation, and RMSE metrics.
"""

import sys
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.stats import pearsonr


def calculate_euclidean_distance(df1, df2, numeric_cols):
    """Calculate the per-row Euclidean distance between two dataframes."""
    distances = []
    for idx in df1.index:
        diff = df1.loc[idx, numeric_cols] - df2.loc[idx, numeric_cols]
        # Use nansum to ignore NaN values. Caveat: a row with no overlapping
        # valid values therefore gets a distance of 0, not NaN.
        distance = np.sqrt(np.nansum(diff ** 2))
        distances.append(distance)
    return np.array(distances)


def calculate_rmse(df1, df2, numeric_cols):
    """Calculate Root Mean Squared Error across all numeric cells."""
    diff = df1[numeric_cols] - df2[numeric_cols]
    # Use nanmean to ignore NaN values
    mse = np.nanmean(diff.values ** 2)
    return np.sqrt(mse)


def calculate_correlation(df1, df2, numeric_cols):
    """Calculate Pearson correlation across all numeric values."""
    vals1 = df1[numeric_cols].values.flatten()
    vals2 = df2[numeric_cols].values.flatten()
    # Remove NaN values - only use positions where both have valid values
    mask = ~(np.isnan(vals1) | np.isnan(vals2))
    vals1_clean = vals1[mask]
    vals2_clean = vals2[mask]
    if len(vals1_clean) < 2:
        return np.nan, np.nan
    corr, pvalue = pearsonr(vals1_clean, vals2_clean)
    return corr, pvalue


def compare_analyses(reference_file, comparison_files):
    """Compare multiple analysis files to a reference file."""
    # Read reference file
    print(f"Reading reference file: {reference_file}")
    ref_df = pd.read_csv(reference_file, quotechar='"', escapechar='\\',
                         engine='python')

    # Get numeric columns (all the rating dimensions)
    numeric_cols = [col for col in ref_df.columns
                    if col.startswith(('Design_', 'Entanglement_', 'Experience_'))]

    # Convert numeric columns to numeric type, coercing errors to NaN
    for col in numeric_cols:
        ref_df[col] = pd.to_numeric(ref_df[col], errors='coerce')

    print(f"\nFound {len(numeric_cols)} numeric dimensions to compare")
    print(f"Comparing {len(ref_df)} protocols\n")
    print("="*80)

    results = {}

    for comp_file in comparison_files:
        print(f"\nComparing: {Path(comp_file).name}")
        print("-"*80)

        # Read comparison file
        comp_df = pd.read_csv(comp_file, quotechar='"', escapechar='\\',
                              engine='python')

        # Convert numeric columns to numeric type, coercing errors to NaN
        for col in numeric_cols:
            comp_df[col] = pd.to_numeric(comp_df[col], errors='coerce')

        # Ensure same protocols in same order (match by Descriptor)
        if 'Descriptor' in ref_df.columns and 'Descriptor' in comp_df.columns:
            # Use merge to ensure exact matching - only keep protocols in ref_df
            comp_df = pd.merge(
                ref_df[['Descriptor']], comp_df,
                on='Descriptor', how='left'
            )

        # Calculate Euclidean distances using reset indices to ensure alignment
        ref_temp = ref_df.reset_index(drop=True)
        comp_temp = comp_df.reset_index(drop=True)
        euclidean_distances = calculate_euclidean_distance(
            ref_temp, comp_temp, numeric_cols)
        total_euclidean = np.sum(euclidean_distances)
        avg_euclidean = np.mean(euclidean_distances)

        # Calculate RMSE
        rmse = calculate_rmse(ref_temp, comp_temp, numeric_cols)

        # Calculate correlation
        correlation, p_value = calculate_correlation(ref_temp, comp_temp,
                                                     numeric_cols)

        # Store results
        results[Path(comp_file).name] = {
            'total_euclidean': total_euclidean,
            'avg_euclidean': avg_euclidean,
            'rmse': rmse,
            'correlation': correlation,
            'p_value': p_value,
            'per_protocol_distances': euclidean_distances,
            'protocols': (ref_df['Descriptor'].values
                          if 'Descriptor' in ref_df.columns else None),
        }

        # Print results
        print(f"  Total Euclidean Distance: {total_euclidean:.2f}")
        print(f"  Average Euclidean Distance: {avg_euclidean:.2f}")
        print(f"  RMSE: {rmse:.2f}")
        print(f"  Pearson Correlation: {correlation:.4f} (p={p_value:.2e})")

    # Summary comparison
    print("\n" + "="*80)
    print("SUMMARY RANKING (lower distance = more similar)")
    print("="*80)

    # Sort by average Euclidean distance
    sorted_by_euclidean = sorted(results.items(),
                                 key=lambda x: x[1]['avg_euclidean'])
    print("\nBy Average Euclidean Distance:")
    for i, (name, data) in enumerate(sorted_by_euclidean, 1):
        print(f"  {i}. {name:30s} - Avg Distance: {data['avg_euclidean']:.2f}")

    # Sort by correlation (higher is better)
    sorted_by_corr = sorted(results.items(),
                            key=lambda x: x[1]['correlation'], reverse=True)
    print("\nBy Correlation (higher = more similar):")
    for i, (name, data) in enumerate(sorted_by_corr, 1):
        print(f"  {i}. {name:30s} - Correlation: {data['correlation']:.4f}")

    # Sort by RMSE
    sorted_by_rmse = sorted(results.items(), key=lambda x: x[1]['rmse'])
    print("\nBy RMSE (lower = more similar):")
    for i, (name, data) in enumerate(sorted_by_rmse, 1):
        print(f"  {i}. {name:30s} - RMSE: {data['rmse']:.2f}")

    # Show protocols with largest differences for the best match
    print("\n" + "="*80)
    best_match_name, best_match_data = sorted_by_euclidean[0]
    print(f"Top 10 protocols with largest differences from {best_match_name}:")
    print("="*80)
    if best_match_data['protocols'] is not None:
        distances = best_match_data['per_protocol_distances']
        protocols = best_match_data['protocols']
        # Indices of the 10 largest per-protocol distances, descending
        top_diff_indices = np.argsort(distances)[-10:][::-1]
        for idx in top_diff_indices:
            print(f"  {protocols[idx]:50s} - Distance: {distances[idx]:.2f}")

    return results


if __name__ == "__main__":
    # Define file paths
    reference_file = "analysis_output_manual.csv"
    comparison_files = [
        "analysis_output_gemma3-12b.csv",
        "analysis_output_gpt-oss.csv",
        "analysis_output_mistral.csv",
    ]

    # Check if files exist
    if not Path(reference_file).exists():
        print(f"Error: Reference file '{reference_file}' not found")
        sys.exit(1)

    # Build a filtered list rather than calling remove() while iterating,
    # which would silently skip the element after each removal.
    existing_files = []
    for file in comparison_files:
        if Path(file).exists():
            existing_files.append(file)
        else:
            print(f"Warning: Comparison file '{file}' not found, skipping...")
    comparison_files = existing_files

    if not comparison_files:
        print("Error: No comparison files found")
        sys.exit(1)

    # Run comparison
    results = compare_analyses(reference_file, comparison_files)

    print("\n" + "="*80)
    print("Analysis complete!")
    print("="*80)