Initial analysis complete

This commit is contained in:
Nathan Schneider
2025-11-16 23:47:10 -07:00
parent 815ed9d6f4
commit dcfd37fa4c
55 changed files with 7173 additions and 450 deletions

View File

@@ -0,0 +1,186 @@
#!/usr/bin/env python3
"""
Compare multiple analysis CSV files to determine which most closely resembles a reference file.
Uses Euclidean distance, correlation, and RMSE metrics.
"""
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from pathlib import Path
def calculate_euclidean_distance(df1, df2, numeric_cols):
    """Return per-row Euclidean distances between two aligned dataframes.

    Rows are paired by index label; NaN differences are ignored via nansum,
    so a row with missing values contributes only its valid dimensions.
    """
    row_distances = [
        np.sqrt(np.nansum((df1.loc[row, numeric_cols] - df2.loc[row, numeric_cols]) ** 2))
        for row in df1.index
    ]
    return np.array(row_distances)
def calculate_rmse(df1, df2, numeric_cols):
    """Return the Root Mean Squared Error over all numeric cells.

    Cells are paired by index/column alignment; NaN entries are excluded
    from the mean via nanmean.
    """
    squared_errors = (df1[numeric_cols] - df2[numeric_cols]).values ** 2
    return np.sqrt(np.nanmean(squared_errors))
def calculate_correlation(df1, df2, numeric_cols):
    """Return (Pearson r, p-value) across all flattened numeric values.

    Only positions where BOTH dataframes hold a finite (non-NaN) value are
    used. If fewer than two such pairs remain, returns (nan, nan).
    """
    flat_a = df1[numeric_cols].values.flatten()
    flat_b = df2[numeric_cols].values.flatten()
    # Keep a position only when neither side is NaN (De Morgan of the
    # "either is NaN" exclusion).
    valid = ~np.isnan(flat_a) & ~np.isnan(flat_b)
    paired_a = flat_a[valid]
    paired_b = flat_b[valid]
    if len(paired_a) < 2:
        return np.nan, np.nan
    r_value, p_value = pearsonr(paired_a, paired_b)
    return r_value, p_value
def compare_analyses(reference_file, comparison_files):
    """Compare multiple analysis CSV files against a reference CSV.

    For each comparison file, computes per-protocol Euclidean distance,
    overall RMSE, and Pearson correlation against the reference, prints a
    ranked summary, and returns a dict of metrics keyed by file name.

    Args:
        reference_file: path to the reference CSV (must contain the rating
            columns prefixed Design_/Entanglement_/Experience_; a
            'Descriptor' column is used for row matching when present).
        comparison_files: iterable of CSV paths to score against the reference.

    Returns:
        dict mapping comparison file basename -> dict with keys
        'total_euclidean', 'avg_euclidean', 'rmse', 'correlation',
        'p_value', 'per_protocol_distances', 'protocols'.
    """
    # Read reference file
    print(f"Reading reference file: {reference_file}")
    # python engine + escapechar tolerates embedded quotes in text columns.
    ref_df = pd.read_csv(reference_file, quotechar='"', escapechar='\\', engine='python')
    # Get numeric columns (all the rating dimensions) — identified purely by
    # column-name prefix, so any other columns are ignored in the metrics.
    numeric_cols = [col for col in ref_df.columns if
                    col.startswith(('Design_', 'Entanglement_', 'Experience_'))]
    # Convert numeric columns to numeric type, coercing errors to NaN
    for col in numeric_cols:
        ref_df[col] = pd.to_numeric(ref_df[col], errors='coerce')
    print(f"\nFound {len(numeric_cols)} numeric dimensions to compare")
    print(f"Comparing {len(ref_df)} protocols\n")
    print("="*80)
    results = {}
    for comp_file in comparison_files:
        print(f"\nComparing: {Path(comp_file).name}")
        print("-"*80)
        # Read comparison file
        comp_df = pd.read_csv(comp_file, quotechar='"', escapechar='\\', engine='python')
        # Convert numeric columns to numeric type, coercing errors to NaN.
        # NOTE(review): assumes the comparison file has the same rating
        # columns as the reference — a missing column would raise KeyError.
        for col in numeric_cols:
            comp_df[col] = pd.to_numeric(comp_df[col], errors='coerce')
        # Ensure same protocols in same order (match by Descriptor)
        if 'Descriptor' in ref_df.columns and 'Descriptor' in comp_df.columns:
            # Use merge to ensure exact matching - only keep protocols in ref_df.
            # Left merge preserves ref_df's row order; protocols missing from
            # the comparison file become NaN rows (ignored by the nan-aware
            # metric functions).
            comp_df = pd.merge(
                ref_df[['Descriptor']],
                comp_df,
                on='Descriptor',
                how='left'
            )
        # Calculate Euclidean distances using reset indices to ensure alignment
        ref_temp = ref_df.reset_index(drop=True)
        comp_temp = comp_df.reset_index(drop=True)
        euclidean_distances = calculate_euclidean_distance(ref_temp, comp_temp, numeric_cols)
        total_euclidean = np.sum(euclidean_distances)
        avg_euclidean = np.mean(euclidean_distances)
        # Calculate RMSE
        rmse = calculate_rmse(ref_temp, comp_temp, numeric_cols)
        # Calculate correlation
        correlation, p_value = calculate_correlation(ref_temp, comp_temp, numeric_cols)
        # Store results
        results[Path(comp_file).name] = {
            'total_euclidean': total_euclidean,
            'avg_euclidean': avg_euclidean,
            'rmse': rmse,
            'correlation': correlation,
            'p_value': p_value,
            'per_protocol_distances': euclidean_distances,
            'protocols': ref_df['Descriptor'].values if 'Descriptor' in ref_df.columns else None
        }
        # Print results
        print(f"  Total Euclidean Distance:    {total_euclidean:.2f}")
        print(f"  Average Euclidean Distance:  {avg_euclidean:.2f}")
        print(f"  RMSE:                        {rmse:.2f}")
        print(f"  Pearson Correlation:         {correlation:.4f} (p={p_value:.2e})")
    # Summary comparison
    print("\n" + "="*80)
    print("SUMMARY RANKING (lower distance = more similar)")
    print("="*80)
    # Sort by average Euclidean distance
    sorted_by_euclidean = sorted(results.items(), key=lambda x: x[1]['avg_euclidean'])
    print("\nBy Average Euclidean Distance:")
    for i, (name, data) in enumerate(sorted_by_euclidean, 1):
        print(f"  {i}. {name:30s} - Avg Distance: {data['avg_euclidean']:.2f}")
    # Sort by correlation (higher is better)
    sorted_by_corr = sorted(results.items(), key=lambda x: x[1]['correlation'], reverse=True)
    print("\nBy Correlation (higher = more similar):")
    for i, (name, data) in enumerate(sorted_by_corr, 1):
        print(f"  {i}. {name:30s} - Correlation: {data['correlation']:.4f}")
    # Sort by RMSE
    sorted_by_rmse = sorted(results.items(), key=lambda x: x[1]['rmse'])
    print("\nBy RMSE (lower = more similar):")
    for i, (name, data) in enumerate(sorted_by_rmse, 1):
        print(f"  {i}. {name:30s} - RMSE: {data['rmse']:.2f}")
    # Show protocols with largest differences for the best match
    print("\n" + "="*80)
    best_match_name, best_match_data = sorted_by_euclidean[0]
    print(f"Top 10 protocols with largest differences from {best_match_name}:")
    print("="*80)
    if best_match_data['protocols'] is not None:
        distances = best_match_data['per_protocol_distances']
        protocols = best_match_data['protocols']
        # argsort ascending, take the last 10, then reverse for descending order.
        top_diff_indices = np.argsort(distances)[-10:][::-1]
        for idx in top_diff_indices:
            print(f"  {protocols[idx]:50s} - Distance: {distances[idx]:.2f}")
    return results
if __name__ == "__main__":
# Define file paths
reference_file = "analysis_output_manual.csv"
comparison_files = [
"analysis_output_gemma3-12b.csv",
"analysis_output_gpt-oss.csv",
"analysis_output_mistral.csv"
]
# Check if files exist
if not Path(reference_file).exists():
print(f"Error: Reference file '{reference_file}' not found")
exit(1)
for file in comparison_files:
if not Path(file).exists():
print(f"Warning: Comparison file '{file}' not found, skipping...")
comparison_files.remove(file)
if not comparison_files:
print("Error: No comparison files found")
exit(1)
# Run comparison
results = compare_analyses(reference_file, comparison_files)
print("\n" + "="*80)
print("Analysis complete!")
print("="*80)