Initial analysis complete

analysis/compare_analyses.py (new file, 186 lines)
@@ -0,0 +1,186 @@
#!/usr/bin/env python3
"""
Compare multiple analysis CSV files to determine which most closely resembles a reference file.
Uses Euclidean distance, correlation, and RMSE metrics.
"""

import sys

import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from pathlib import Path


def calculate_euclidean_distance(df1, df2, numeric_cols):
    """Calculate per-row Euclidean distance between two dataframes.

    Assumes df1 and df2 share the same index; callers reset the index first.
    """
    distances = []
    for idx in df1.index:
        diff = df1.loc[idx, numeric_cols] - df2.loc[idx, numeric_cols]
        # Use nansum to ignore NaN values
        distance = np.sqrt(np.nansum(diff ** 2))
        distances.append(distance)
    return np.array(distances)
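

# Minimal sanity sketch for the helper above, using made-up two-row frames and
# one hypothetical dimension column; nothing in the pipeline calls this.
def _demo_euclidean_distance():
    a = pd.DataFrame({'Design_Example': [1.0, 2.0]})
    b = pd.DataFrame({'Design_Example': [1.0, 4.0]})
    # Expected per-row distances: [0.0, 2.0]
    return calculate_euclidean_distance(a, b, ['Design_Example'])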


def calculate_rmse(df1, df2, numeric_cols):
    """Calculate Root Mean Squared Error."""
    diff = df1[numeric_cols] - df2[numeric_cols]
    # Use nanmean to ignore NaN values
    mse = np.nanmean(diff.values ** 2)
    return np.sqrt(mse)
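
# Worked example (hypothetical numbers): element-wise differences of
# [0.0, 2.0] give MSE (0 + 4) / 2 = 2.0 and RMSE sqrt(2) ~= 1.41. Note that
# np.nanmean divides by the count of non-NaN cells only.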


def calculate_correlation(df1, df2, numeric_cols):
    """Calculate Pearson correlation across all numeric values."""
    vals1 = df1[numeric_cols].values.flatten()
    vals2 = df2[numeric_cols].values.flatten()

    # Remove NaN values - only use positions where both have valid values
    mask = ~(np.isnan(vals1) | np.isnan(vals2))
    vals1_clean = vals1[mask]
    vals2_clean = vals2[mask]

    if len(vals1_clean) < 2:
        return np.nan, np.nan

    corr, pvalue = pearsonr(vals1_clean, vals2_clean)
    return corr, pvalue
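
# Sketch of the masking behaviour (made-up vectors): comparing
# [1, 2, NaN, 4] with [2, 4, 6, 8] drops the third position; the remaining
# three pairs are perfectly linear, so the correlation is 1.0.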


def compare_analyses(reference_file, comparison_files):
    """Compare multiple analysis files to a reference file."""

    # Read reference file
    print(f"Reading reference file: {reference_file}")
    ref_df = pd.read_csv(reference_file, quotechar='"', escapechar='\\', engine='python')

    # Get numeric columns (all the rating dimensions)
    numeric_cols = [col for col in ref_df.columns if
                    col.startswith(('Design_', 'Entanglement_', 'Experience_'))]

    # Coerce the rating columns to numeric, turning unparseable entries into NaN
    for col in numeric_cols:
        ref_df[col] = pd.to_numeric(ref_df[col], errors='coerce')

    print(f"\nFound {len(numeric_cols)} numeric dimensions to compare")
    print(f"Comparing {len(ref_df)} protocols\n")
    print("="*80)

    results = {}

    for comp_file in comparison_files:
        print(f"\nComparing: {Path(comp_file).name}")
        print("-"*80)

        # Read comparison file
        comp_df = pd.read_csv(comp_file, quotechar='"', escapechar='\\', engine='python')

        # Coerce the rating columns to numeric, turning unparseable entries into NaN
        for col in numeric_cols:
            comp_df[col] = pd.to_numeric(comp_df[col], errors='coerce')

        # Ensure same protocols in same order (match by Descriptor)
        if 'Descriptor' in ref_df.columns and 'Descriptor' in comp_df.columns:
            # Use merge to ensure exact matching - keep only protocols in ref_df
            comp_df = pd.merge(
                ref_df[['Descriptor']],
                comp_df,
                on='Descriptor',
                how='left'
            )
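            # After the left merge, rows follow ref_df's Descriptor order.
            # Protocols absent from the comparison file become all-NaN rows;
            # np.nansum then scores such a row as distance 0, so a file with
            # missing protocols can look more similar than it really is.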

        # Calculate Euclidean distances using reset indices to ensure alignment
        ref_temp = ref_df.reset_index(drop=True)
        comp_temp = comp_df.reset_index(drop=True)
        euclidean_distances = calculate_euclidean_distance(ref_temp, comp_temp, numeric_cols)
        total_euclidean = np.sum(euclidean_distances)
        avg_euclidean = np.mean(euclidean_distances)

        # Calculate RMSE
        rmse = calculate_rmse(ref_temp, comp_temp, numeric_cols)

        # Calculate correlation
        correlation, p_value = calculate_correlation(ref_temp, comp_temp, numeric_cols)

        # Store results
        results[Path(comp_file).name] = {
            'total_euclidean': total_euclidean,
            'avg_euclidean': avg_euclidean,
            'rmse': rmse,
            'correlation': correlation,
            'p_value': p_value,
            'per_protocol_distances': euclidean_distances,
            'protocols': ref_df['Descriptor'].values if 'Descriptor' in ref_df.columns else None
        }

        # Print results
        print(f"  Total Euclidean Distance:   {total_euclidean:.2f}")
        print(f"  Average Euclidean Distance: {avg_euclidean:.2f}")
        print(f"  RMSE: {rmse:.2f}")
        print(f"  Pearson Correlation: {correlation:.4f} (p={p_value:.2e})")

    # Summary comparison
    print("\n" + "="*80)
    print("SUMMARY RANKING (lower distance = more similar)")
    print("="*80)

    # Sort by average Euclidean distance
    sorted_by_euclidean = sorted(results.items(), key=lambda x: x[1]['avg_euclidean'])

    print("\nBy Average Euclidean Distance:")
    for i, (name, data) in enumerate(sorted_by_euclidean, 1):
        print(f"  {i}. {name:30s} - Avg Distance: {data['avg_euclidean']:.2f}")

    # Sort by correlation (higher is better)
    sorted_by_corr = sorted(results.items(), key=lambda x: x[1]['correlation'], reverse=True)

    print("\nBy Correlation (higher = more similar):")
    for i, (name, data) in enumerate(sorted_by_corr, 1):
        print(f"  {i}. {name:30s} - Correlation: {data['correlation']:.4f}")

    # Sort by RMSE
    sorted_by_rmse = sorted(results.items(), key=lambda x: x[1]['rmse'])

    print("\nBy RMSE (lower = more similar):")
    for i, (name, data) in enumerate(sorted_by_rmse, 1):
        print(f"  {i}. {name:30s} - RMSE: {data['rmse']:.2f}")

    # Show protocols with largest differences for the best match
    print("\n" + "="*80)
    best_match_name, best_match_data = sorted_by_euclidean[0]
    print(f"Top 10 protocols with largest differences from {best_match_name}:")
    print("="*80)

    if best_match_data['protocols'] is not None:
        distances = best_match_data['per_protocol_distances']
        protocols = best_match_data['protocols']
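        # np.argsort is ascending, so the last 10 indices are the largest
        # distances; reversing lists them biggest-first.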
        top_diff_indices = np.argsort(distances)[-10:][::-1]

        for idx in top_diff_indices:
            print(f"  {protocols[idx]:50s} - Distance: {distances[idx]:.2f}")

    return results
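

# The returned dict maps each comparison file name to its metrics, so the
# function can also be used programmatically, e.g. (hypothetical file names):
#     results = compare_analyses("ref.csv", ["model_a.csv"])
#     print(results["model_a.csv"]["rmse"])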


if __name__ == "__main__":
    # Define file paths
    reference_file = "analysis_output_manual.csv"
    comparison_files = [
        "analysis_output_gemma3-12b.csv",
        "analysis_output_gpt-oss.csv",
        "analysis_output_mistral.csv"
    ]

    # Check if files exist
    if not Path(reference_file).exists():
        print(f"Error: Reference file '{reference_file}' not found")
        sys.exit(1)

    # Filter with comprehensions: removing items from a list while iterating
    # over it skips elements, so build new lists instead
    missing = [f for f in comparison_files if not Path(f).exists()]
    for file in missing:
        print(f"Warning: Comparison file '{file}' not found, skipping...")
    comparison_files = [f for f in comparison_files if Path(f).exists()]

    if not comparison_files:
        print("Error: No comparison files found")
        sys.exit(1)

    # Run comparison
    results = compare_analyses(reference_file, comparison_files)

    print("\n" + "="*80)
    print("Analysis complete!")
    print("="*80)