- Move all scripts to scripts/, web assets to web/, analysis results into
  self-contained data/readings/<type>_<YYYYMMDD>/ directories
- Add data/readings/manual_20260320/ with 32 JSON readings from
  git.medlab.host/ntnsndr/protocol-bicorder-data
- Add scripts/json_to_csv.py to convert bicorder JSON files to CSV
- Add scripts/sync_readings.sh for one-command sync + re-analysis of any
  dataset backed by a .sync_source config file
- Add scripts/classify_readings.py to apply the LDA classifier to all
  readings and save per-reading cluster assignments
- Add --min-coverage flag to multivariate_analysis.py for sparse/shortform
  datasets; also applies in lda_visualization.py
- Fix lda_visualization.py NaN handling and 0-d array annotation bug
- Update README.md and WORKFLOW.md to document datasets, sync workflow,
  shortform handling, and new scripts

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
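For orientation, a rough sketch of the new per-dataset workflow; the
arguments below are assumptions for illustration, not the documented
invocations (see WORKFLOW.md for those):

    # hypothetical arguments -- check each script's --help
    python3 scripts/json_to_csv.py data/readings/manual_20260320      # JSON readings -> readings.csv
    ./scripts/sync_readings.sh data/readings/manual_20260320          # re-sync + re-analyze via .sync_source
    python3 scripts/classify_readings.py data/readings/manual_20260320/readings.csv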
#!/usr/bin/env python3
"""
Comprehensive review of the analysis for errors and inconsistencies.

Usage:
    python3 scripts/review_analysis.py data/readings/synthetic_20251116/readings.csv
    python3 scripts/review_analysis.py data/readings/manual_20260101/readings.csv --analysis-dir data/readings/manual_20260101/analysis
"""

import argparse
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import stats


def main():
    parser = argparse.ArgumentParser(
        description='Check analysis results for errors and inconsistencies',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
  python3 scripts/review_analysis.py data/readings/synthetic_20251116/readings.csv
  python3 scripts/review_analysis.py data/readings/manual_20260101/readings.csv --analysis-dir data/readings/manual_20260101/analysis
"""
    )
    parser.add_argument('input_csv',
                        help='Diagnostic readings CSV (e.g. data/readings/synthetic_20251116/readings.csv)')
    parser.add_argument('--analysis-dir', default=None,
                        help='Analysis directory (default: <dataset_dir>/analysis)')
    args = parser.parse_args()

    dataset_dir = Path(args.input_csv).parent
    results_dir = Path(args.analysis_dir) if args.analysis_dir else dataset_dir / 'analysis'
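    # Analysis artifacts are expected in CSV form under <results_dir>/data/,
    # per the self-contained dataset layout documented in WORKFLOW.md.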
print("=" * 80)
|
|
print("ANALYSIS REVIEW - ERROR CHECKING")
|
|
print("=" * 80)
|
|
print(f"Dataset: {args.input_csv}")
|
|
print(f"Results: {results_dir}")
|
|
|
|
# Load data
|
|
df = pd.read_csv(args.input_csv)
|
|
clusters = pd.read_csv(results_dir / 'data' / 'kmeans_clusters.csv')
|
|
pca_coords = pd.read_csv(results_dir / 'data' / 'pca_coordinates.csv')
|
|
|
|
# Identify dimension columns
|
|
design_cols = [c for c in df.columns if c.startswith('Design_')]
|
|
entanglement_cols = [c for c in df.columns if c.startswith('Entanglement_')]
|
|
experience_cols = [c for c in df.columns if c.startswith('Experience_')]
|
|
dimension_cols = design_cols + entanglement_cols + experience_cols
|
|
|
|
errors_found = []
|
|
warnings_found = []
|
|
|
|
print("\n1. DATA COMPLETENESS CHECK")
|
|
print("-" * 80)
|
|
|
|
missing_count = df[dimension_cols].isna().sum().sum()
|
|
rows_with_missing = df[dimension_cols].isna().any(axis=1).sum()
|
|
|
|
print(f"✓ Total protocols in source data: {len(df)}")
|
|
print(f"✓ Protocols with complete data: {len(df) - rows_with_missing}")
|
|
print(f"✓ Protocols with missing values: {rows_with_missing}")
|
|
print(f"✓ Protocols in cluster analysis: {len(clusters)}")
|
|
|
|
if rows_with_missing > 0:
|
|
warnings_found.append(f"{rows_with_missing} protocols excluded due to missing values")
|
|
missing_protocols = df[df[dimension_cols].isna().any(axis=1)]['Descriptor'].tolist()
|
|
print(f"\n Excluded protocols: {', '.join(missing_protocols)}")
|
|
|
|
merged = df.merge(clusters, on='Descriptor', how='inner')
|
|
if len(merged) != len(clusters):
|
|
errors_found.append(f"Descriptor mismatch: {len(merged)} matched vs {len(clusters)} expected")
|
|
else:
|
|
print(f"✓ All cluster descriptors match source data")
|
|
|
|
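    # Bicorder dimensions are scored on a 1-9 scale; anything outside that
    # range points to a conversion or data-entry error upstream.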
print("\n2. DATA QUALITY CHECK")
|
|
print("-" * 80)
|
|
|
|
for col in dimension_cols:
|
|
values = df[col].dropna()
|
|
if values.min() < 1 or values.max() > 9:
|
|
errors_found.append(f"Column {col} has out-of-range values: [{values.min()}, {values.max()}]")
|
|
|
|
print(f"✓ All dimension values within expected range [1, 9]")
|
|
|
|

    df_clean = df.dropna(subset=dimension_cols)
    variances = df_clean[dimension_cols].var()
    low_var_dims = variances[variances < 1.0]
    if len(low_var_dims) > 0:
        warnings_found.append(f"{len(low_var_dims)} dimensions have very low variance (< 1.0)")
        print("\n  Low variance dimensions:")
        for dim, var in low_var_dims.items():
            print(f"    - {dim}: {var:.3f}")
    else:
        print("✓ All dimensions have reasonable variance")
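    # Clustering sanity checks: per-cluster sizes and balance, plus row-count
    # agreement between the k-means and PCA outputs. Iterating over
    # value_counts() avoids hard-coding the cluster labels.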
print("\n3. CLUSTERING VALIDATION")
|
|
print("-" * 80)
|
|
|
|
cluster_sizes = clusters['cluster'].value_counts().sort_index()
|
|
print(f"✓ Cluster 1: {cluster_sizes[1]} protocols ({cluster_sizes[1]/len(clusters)*100:.1f}%)")
|
|
print(f"✓ Cluster 2: {cluster_sizes[2]} protocols ({cluster_sizes[2]/len(clusters)*100:.1f}%)")
|
|
|
|
imbalance_ratio = max(cluster_sizes) / min(cluster_sizes)
|
|
if imbalance_ratio > 2.0:
|
|
warnings_found.append(f"Cluster imbalance ratio is {imbalance_ratio:.2f} (ideally < 2.0)")
|
|
|
|
if len(pca_coords) != len(clusters):
|
|
errors_found.append(f"PCA coordinates count ({len(pca_coords)}) != cluster count ({len(clusters)})")
|
|
else:
|
|
print(f"✓ PCA coordinates match cluster count")
|
|
|
|
pca_loadings = pd.read_csv(results_dir / 'data' / 'pca_loadings.csv', index_col=0)
|
|
if pca_loadings.shape[0] != 23:
|
|
errors_found.append(f"PCA loadings have {pca_loadings.shape[0]} rows, expected 23")
|
|
else:
|
|
print(f"✓ PCA loadings have correct dimensions")
|
|
|
|
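    # The correlation matrix is symmetric, so np.where over the full matrix
    # reports each offending pair twice; halve the count for unique pairs.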
print("\n4. STATISTICAL VALIDITY")
|
|
print("-" * 80)
|
|
|
|
corr_matrix = pd.read_csv(results_dir / 'data' / 'correlation_matrix.csv', index_col=0)
|
|
|
|
np.fill_diagonal(corr_matrix.values, 0)
|
|
perfect_corrs = np.where(np.abs(corr_matrix.values) > 0.99)
|
|
if len(perfect_corrs[0]) > 0:
|
|
warnings_found.append(f"Found {len(perfect_corrs[0])} near-perfect correlations between dimensions")
|
|
else:
|
|
print(f"✓ No perfect correlations found (multicollinearity check)")
|
|
|
|
try:
|
|
if corr_matrix.shape[0] == corr_matrix.shape[1]:
|
|
if not np.allclose(corr_matrix.values, corr_matrix.values.T, equal_nan=True):
|
|
errors_found.append("Correlation matrix is not symmetric")
|
|
else:
|
|
print(f"✓ Correlation matrix is symmetric")
|
|
else:
|
|
errors_found.append(f"Correlation matrix is not square: {corr_matrix.shape}")
|
|
except Exception as e:
|
|
warnings_found.append(f"Could not verify correlation matrix symmetry: {e}")
|
|
|
|
print("\n5. AVERAGE VALUES CHECK")
|
|
print("-" * 80)
|
|
|
|
df_clean = df.dropna(subset=dimension_cols)
|
|
calculated_averages = df_clean[dimension_cols].mean(axis=1)
|
|
print(f"✓ Average values range: [{calculated_averages.min():.2f}, {calculated_averages.max():.2f}]")
|
|
print(f"✓ Mean of averages: {calculated_averages.mean():.2f}")
|
|
print(f"✓ Std of averages: {calculated_averages.std():.2f}")
|
|
|
|
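    # Chi-square goodness-of-fit against a uniform distribution over 0.5-wide
    # bins. `bins` holds edges, so there are len(bins) - 1 bins and
    # len(bins) - 2 degrees of freedom. Real readings should *fail* this
    # uniformity test (low p-value).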
    bins = np.arange(int(calculated_averages.min()), int(calculated_averages.max()) + 1, 0.5)
    observed_counts, _ = np.histogram(calculated_averages, bins=bins)
    expected_count = len(calculated_averages) / (len(bins) - 1)
    chi2_stat = np.sum((observed_counts - expected_count) ** 2 / expected_count)
    p_value = stats.chi2.sf(chi2_stat, len(bins) - 2)

    print(f"✓ Distribution uniformity test p-value: {p_value:.4f}")
    if p_value < 0.05:
        print("  (Distribution is significantly non-uniform, as expected for real data)")
    else:
        warnings_found.append("Average values may be too uniformly distributed (p >= 0.05)")
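    # Per-dimension separation between the two k-means clusters (labels 1 and
    # 2, matching kmeans_clusters.csv). A mean gap above 0.5 on the 1-9 scale
    # is treated as a meaningful difference.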
print("\n6. CLUSTER SEPARATION CHECK")
|
|
print("-" * 80)
|
|
|
|
merged = df_clean.merge(clusters, on='Descriptor')
|
|
cluster1_means = merged[merged['cluster'] == 1][dimension_cols].mean()
|
|
cluster2_means = merged[merged['cluster'] == 2][dimension_cols].mean()
|
|
|
|
differences = (cluster1_means - cluster2_means).abs()
|
|
significant_diffs = differences[differences > 0.5]
|
|
print(f"✓ Dimensions with meaningful difference (>0.5) between clusters: {len(significant_diffs)}/23")
|
|
|
|
if len(significant_diffs) < 5:
|
|
warnings_found.append(f"Only {len(significant_diffs)} dimensions show meaningful separation between clusters")
|
|
|
|
print(f"\n Top 5 differentiating dimensions:")
|
|
for dim in differences.nlargest(5).index:
|
|
print(f" - {dim}: {differences[dim]:.3f}")
|
|
|
|
print("\n" + "=" * 80)
|
|
print("SUMMARY")
|
|
print("=" * 80)
|
|
|
|
if len(errors_found) == 0:
|
|
print("✓ No critical errors found!")
|
|
else:
|
|
print(f"✗ {len(errors_found)} CRITICAL ERROR(S) FOUND:")
|
|
for i, error in enumerate(errors_found, 1):
|
|
print(f" {i}. {error}")
|
|
|
|
if len(warnings_found) == 0:
|
|
print("✓ No warnings!")
|
|
else:
|
|
print(f"\n⚠ {len(warnings_found)} WARNING(S):")
|
|
for i, warning in enumerate(warnings_found, 1):
|
|
print(f" {i}. {warning}")
|
|
|
|
print("\n" + "=" * 80)
|
|
print("REVIEW COMPLETE")
|
|
print("=" * 80)
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|