#!/usr/bin/env python3
"""
Comprehensive review of the analysis for errors and inconsistencies.
"""
import pandas as pd
import numpy as np
from scipy import stats

print("=" * 80)
print("ANALYSIS REVIEW - ERROR CHECKING")
print("=" * 80)

# Load data
df = pd.read_csv('diagnostic_output.csv')
clusters = pd.read_csv('analysis_results/data/kmeans_clusters.csv')
pca_coords = pd.read_csv('analysis_results/data/pca_coordinates.csv')

# Identify dimension columns
design_cols = [c for c in df.columns if c.startswith('Design_')]
entanglement_cols = [c for c in df.columns if c.startswith('Entanglement_')]
experience_cols = [c for c in df.columns if c.startswith('Experience_')]
dimension_cols = design_cols + entanglement_cols + experience_cols

errors_found = []
warnings_found = []

print("\n1. DATA COMPLETENESS CHECK")
print("-" * 80)

# Check 1: Missing values
missing_count = df[dimension_cols].isna().sum().sum()
rows_with_missing = df[dimension_cols].isna().any(axis=1).sum()
print(f"✓ Total protocols in source data: {len(df)}")
print(f"✓ Protocols with complete data: {len(df) - rows_with_missing}")
print(f"✓ Protocols with missing values: {rows_with_missing} ({missing_count} missing cells)")
print(f"✓ Protocols in cluster analysis: {len(clusters)}")

if rows_with_missing > 0:
    warnings_found.append(f"{rows_with_missing} protocols excluded due to missing values")
    missing_protocols = df[df[dimension_cols].isna().any(axis=1)]['Descriptor'].tolist()
    print(f"\n  Excluded protocols: {', '.join(missing_protocols)}")

# Check 2: Descriptor consistency
merged = df.merge(clusters, on='Descriptor', how='inner')
if len(merged) != len(clusters):
    errors_found.append(f"Descriptor mismatch: {len(merged)} matched vs {len(clusters)} expected")
else:
    print("✓ All cluster descriptors match source data")

print("\n2. DATA QUALITY CHECK")
print("-" * 80)

# Check 3: Value ranges (dimensions should lie in [1, 9]).
# Only print the success line when no column actually failed.
range_errors = 0
for col in dimension_cols:
    values = df[col].dropna()
    if values.min() < 1 or values.max() > 9:
        range_errors += 1
        errors_found.append(f"Column {col} has out-of-range values: [{values.min()}, {values.max()}]")
if range_errors == 0:
    print("✓ All dimension values within expected range [1, 9]")

# Check 4: Variance check - flag dimensions with very low variance
df_clean = df.dropna(subset=dimension_cols)
variances = df_clean[dimension_cols].var()
low_var_dims = variances[variances < 1.0]
if len(low_var_dims) > 0:
    warnings_found.append(f"{len(low_var_dims)} dimensions have very low variance (< 1.0)")
    print("\n  Low variance dimensions:")
    for dim, var in low_var_dims.items():
        print(f"    - {dim}: {var:.3f}")
else:
    print("✓ All dimensions have reasonable variance")
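# --- Optional supplementary check (a sketch, not part of the original
# review): the merges in this script use 'Descriptor' as a join key, so
# duplicated descriptors would silently inflate matched row counts.
dup_descriptors = df['Descriptor'][df['Descriptor'].duplicated()].unique()
if len(dup_descriptors) > 0:
    warnings_found.append(
        f"{len(dup_descriptors)} duplicated Descriptor value(s): "
        f"{', '.join(map(str, dup_descriptors))}")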
CLUSTERING VALIDATION") print("-" * 80) # Check 5: Cluster balance cluster_sizes = clusters['cluster'].value_counts().sort_index() print(f"✓ Cluster 1: {cluster_sizes[1]} protocols ({cluster_sizes[1]/len(clusters)*100:.1f}%)") print(f"✓ Cluster 2: {cluster_sizes[2]} protocols ({cluster_sizes[2]/len(clusters)*100:.1f}%)") imbalance_ratio = max(cluster_sizes) / min(cluster_sizes) if imbalance_ratio > 2.0: warnings_found.append(f"Cluster imbalance ratio is {imbalance_ratio:.2f} (ideally < 2.0)") # Check 6: PCA coordinates consistency if len(pca_coords) != len(clusters): errors_found.append(f"PCA coordinates count ({len(pca_coords)}) != cluster count ({len(clusters)})") else: print(f"✓ PCA coordinates match cluster count") # Check 7: Verify PCA loadings sum pca_loadings = pd.read_csv('analysis_results/data/pca_loadings.csv', index_col=0) if pca_loadings.shape[0] != 23: errors_found.append(f"PCA loadings have {pca_loadings.shape[0]} rows, expected 23") else: print(f"✓ PCA loadings have correct dimensions") print("\n4. STATISTICAL VALIDITY") print("-" * 80) # Check 8: Correlation matrix corr_matrix = pd.read_csv('analysis_results/data/correlation_matrix.csv', index_col=0) # Check for perfect correlations (excluding diagonal) np.fill_diagonal(corr_matrix.values, 0) perfect_corrs = np.where(np.abs(corr_matrix.values) > 0.99) if len(perfect_corrs[0]) > 0: warnings_found.append(f"Found {len(perfect_corrs[0])} near-perfect correlations between dimensions") else: print(f"✓ No perfect correlations found (multicollinearity check)") # Check 9: Correlation matrix symmetry try: if corr_matrix.shape[0] == corr_matrix.shape[1]: if not np.allclose(corr_matrix.values, corr_matrix.values.T, equal_nan=True): errors_found.append("Correlation matrix is not symmetric") else: print(f"✓ Correlation matrix is symmetric") else: errors_found.append(f"Correlation matrix is not square: {corr_matrix.shape}") except Exception as e: warnings_found.append(f"Could not verify correlation matrix symmetry: {e}") print("\n5. AVERAGE VALUES CHECK") print("-" * 80) # Check 10: Verify average calculation df_clean = df.dropna(subset=dimension_cols) calculated_averages = df_clean[dimension_cols].mean(axis=1) print(f"✓ Average values range: [{calculated_averages.min():.2f}, {calculated_averages.max():.2f}]") print(f"✓ Mean of averages: {calculated_averages.mean():.2f}") print(f"✓ Std of averages: {calculated_averages.std():.2f}") # Check for suspiciously uniform distribution from scipy import stats # Test for uniformity using chi-square test bins = np.arange(int(calculated_averages.min()), int(calculated_averages.max()) + 1, 0.5) observed_counts, _ = np.histogram(calculated_averages, bins=bins) expected_count = len(calculated_averages) / len(bins[:-1]) chi2_stat = np.sum((observed_counts - expected_count)**2 / expected_count) p_value = 1 - stats.chi2.cdf(chi2_stat, len(bins) - 2) print(f"✓ Distribution uniformity test p-value: {p_value:.4f}") if p_value < 0.05: print(f" (Distribution is significantly non-uniform, as expected for real data)") else: warnings_found.append("Average values may be too uniformly distributed (p > 0.05)") print("\n6. 
CLUSTER SEPARATION CHECK") print("-" * 80) # Check 11: Verify cluster separation is meaningful merged = df_clean.merge(clusters, on='Descriptor') cluster1_means = merged[merged['cluster'] == 1][dimension_cols].mean() cluster2_means = merged[merged['cluster'] == 2][dimension_cols].mean() differences = (cluster1_means - cluster2_means).abs() significant_diffs = differences[differences > 0.5] print(f"✓ Dimensions with meaningful difference (>0.5) between clusters: {len(significant_diffs)}/23") if len(significant_diffs) < 5: warnings_found.append(f"Only {len(significant_diffs)} dimensions show meaningful separation between clusters") # Show top differentiating dimensions print(f"\n Top 5 differentiating dimensions:") for dim in differences.nlargest(5).index: print(f" - {dim}: {differences[dim]:.3f}") print("\n" + "=" * 80) print("SUMMARY") print("=" * 80) if len(errors_found) == 0: print("✓ No critical errors found!") else: print(f"✗ {len(errors_found)} CRITICAL ERROR(S) FOUND:") for i, error in enumerate(errors_found, 1): print(f" {i}. {error}") if len(warnings_found) == 0: print("✓ No warnings!") else: print(f"\n⚠ {len(warnings_found)} WARNING(S):") for i, warning in enumerate(warnings_found, 1): print(f" {i}. {warning}") print("\n" + "=" * 80) print("REVIEW COMPLETE") print("=" * 80)