Reorganize directory, add manual dataset and sync tooling

- Move all scripts to scripts/, web assets to web/, analysis results
  into self-contained data/readings/<type>_<YYYYMMDD>/ directories
- Add data/readings/manual_20260320/ with 32 JSON readings from
  git.medlab.host/ntnsndr/protocol-bicorder-data
- Add scripts/json_to_csv.py to convert bicorder JSON files to CSV
- Add scripts/sync_readings.sh for one-command sync + re-analysis of
  any dataset backed by a .sync_source config file
- Add scripts/classify_readings.py to apply the LDA classifier to all
  readings and save per-reading cluster assignments
- Add --min-coverage flag to multivariate_analysis.py for sparse/shortform
  datasets; also applies in lda_visualization.py
- Fix lda_visualization.py NaN handling and 0-d array annotation bug
- Update README.md and WORKFLOW.md to document datasets, sync workflow,
  shortform handling, and new scripts

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Nathan Schneider
2026-03-20 17:35:13 -06:00
parent 0c794dddae
commit 897c30406b
545 changed files with 10715 additions and 718 deletions

View File

@@ -0,0 +1,858 @@
#!/usr/bin/env python3
"""
Multivariate Analysis Script for Protocol Bicorder Data
Performs comprehensive multivariate statistical analyses on protocol diagnostic data,
including clustering, dimensionality reduction, correlation analysis, and visualization.
Usage:
python3 multivariate_analysis.py diagnostic_output.csv [--analyses all]
python3 multivariate_analysis.py diagnostic_output.csv --analyses clustering pca
"""
import argparse
import sys
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from scipy.spatial.distance import pdist, squareform
import networkx as nx
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import silhouette_score, davies_bouldin_score
try:
import umap
UMAP_AVAILABLE = True
except ImportError:
UMAP_AVAILABLE = False
print("Note: UMAP not available. Install with: pip install umap-learn")
class ProtocolAnalyzer:
"""Main class for multivariate analysis of protocol data."""
def __init__(self, csv_path, output_dir='analysis_results', min_coverage=0.0):
    """Initialize analyzer with data and output directory.

    Args:
        csv_path: Path to diagnostic CSV file
        output_dir: Directory for analysis output
        min_coverage: Drop dimension columns with fewer than this fraction of
            non-null values (0.0-1.0). Useful for sparse/shortform
            datasets. E.g. 0.8 keeps only columns with >=80% coverage.
    """
    self.csv_path = Path(csv_path)
    self.output_dir = Path(output_dir)
    self.min_coverage = min_coverage
    # parents=True so the default '<dataset_dir>/analysis' works even when
    # intermediate directories do not exist yet (plain exist_ok crashed then).
    self.output_dir.mkdir(parents=True, exist_ok=True)
    # Create subdirectories for each kind of output artifact
    (self.output_dir / 'plots').mkdir(exist_ok=True)
    (self.output_dir / 'data').mkdir(exist_ok=True)
    (self.output_dir / 'reports').mkdir(exist_ok=True)
    # Attributes populated by _load_data()
    self.df = None
    self.dimension_cols = []
    self.design_cols = []
    self.entanglement_cols = []
    self.experience_cols = []
    self.scaled_data = None
    self.scaler = None
    self._load_data()
def _load_data(self):
    """Read the CSV and bucket its dimension columns by category prefix."""
    print(f"Loading data from {self.csv_path}...")
    self.df = pd.read_csv(self.csv_path)
    # Dimension columns are recognized purely by their name prefix
    columns = list(self.df.columns)
    self.design_cols = [col for col in columns if col.startswith('Design_')]
    self.entanglement_cols = [col for col in columns if col.startswith('Entanglement_')]
    self.experience_cols = [col for col in columns if col.startswith('Experience_')]
    self.dimension_cols = self.design_cols + self.entanglement_cols + self.experience_cols
    print(f"Loaded {len(self.df)} protocols with {len(self.dimension_cols)} dimensions")
    print(f" - Design: {len(self.design_cols)}")
    print(f" - Entanglement: {len(self.entanglement_cols)}")
    print(f" - Experience: {len(self.experience_cols)}")
    # Optionally prune sparse dimensions before any row filtering
    if self.min_coverage > 0.0:
        coverage = self.df[self.dimension_cols].notna().sum() / len(self.df)
        dropped = [col for col in self.dimension_cols if coverage[col] < self.min_coverage]
        if dropped:
            print(f"\nDropping {len(dropped)} dimension(s) below {self.min_coverage:.0%} coverage:")
            for col in dropped:
                print(f" - {col}: {coverage[col]:.0%}")
            removed = set(dropped)
            self.dimension_cols = [c for c in self.dimension_cols if c not in removed]
            self.design_cols = [c for c in self.design_cols if c not in removed]
            self.entanglement_cols = [c for c in self.entanglement_cols if c not in removed]
            self.experience_cols = [c for c in self.experience_cols if c not in removed]
            print(f"Remaining dimensions: {len(self.dimension_cols)}")
    # Rows with any remaining missing dimension value are removed entirely
    dims = self.df[self.dimension_cols]
    missing_count = dims.isna().sum().sum()
    rows_with_missing = dims.isna().any(axis=1).sum()
    if missing_count > 0:
        print(f"\nWarning: Found {missing_count} missing values in {rows_with_missing} rows")
        print("Dropping rows with missing values...")
        self.df = self.df.dropna(subset=self.dimension_cols)
        print(f"Dataset now contains {len(self.df)} protocols")
    # Z-score the dimension data for the distance-based analyses downstream
    self.scaler = StandardScaler()
    self.scaled_data = self.scaler.fit_transform(self.df[self.dimension_cols])
def save_results(self, data, filename, subdir='data'):
"""Save results to CSV file."""
output_path = self.output_dir / subdir / filename
if isinstance(data, pd.DataFrame):
data.to_csv(output_path, index=False)
else:
pd.DataFrame(data).to_csv(output_path)
print(f" Saved: {output_path}")
def save_plot(self, filename):
    """Write the active matplotlib figure to plots/ at 300 dpi, then close it."""
    destination = self.output_dir / 'plots' / filename
    plt.tight_layout()
    plt.savefig(destination, dpi=300, bbox_inches='tight')
    print(f" Saved: {destination}")
    # Close so figures never accumulate across the many analysis methods
    plt.close()
# ========== CLUSTERING ANALYSES ==========
def kmeans_clustering(self, n_clusters_range=(2, 10)):
    """Perform K-means clustering, choosing k by silhouette score.

    Args:
        n_clusters_range: Inclusive (min_k, max_k) candidate range.

    Returns:
        Per-protocol labels (also stored as df['kmeans_cluster']).
    """
    print("\n=== K-Means Clustering ===")
    # silhouette_score requires 2 <= k <= n_samples - 1; clamp the sweep so
    # small (e.g. shortform) datasets don't crash the elbow loop. This also
    # makes the old dead `if k > 1` guard unnecessary.
    low = max(2, n_clusters_range[0])
    high = min(n_clusters_range[1], len(self.df) - 1)
    k_range = range(low, high + 1)
    inertias = []
    silhouettes = []
    for k in k_range:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = kmeans.fit_predict(self.scaled_data)
        inertias.append(kmeans.inertia_)
        silhouettes.append(silhouette_score(self.scaled_data, labels))
    # Plot elbow curve and silhouette profile side by side
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
    ax1.plot(k_range, inertias, 'bo-')
    ax1.set_xlabel('Number of Clusters (k)')
    ax1.set_ylabel('Inertia')
    ax1.set_title('Elbow Method for Optimal k')
    ax1.grid(True, alpha=0.3)
    ax2.plot(k_range, silhouettes, 'ro-')
    ax2.set_xlabel('Number of Clusters (k)')
    ax2.set_ylabel('Silhouette Score')
    ax2.set_title('Silhouette Score by k')
    ax2.grid(True, alpha=0.3)
    self.save_plot('kmeans_elbow.png')
    # Refit at the k with the highest silhouette score
    optimal_k = k_range[np.argmax(silhouettes)]
    print(f"Optimal k by silhouette score: {optimal_k}")
    kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
    self.df['kmeans_cluster'] = kmeans.fit_predict(self.scaled_data)
    # Save 1-indexed assignments for readability
    results = self.df[['Descriptor', 'kmeans_cluster']].copy()
    results['cluster'] = results['kmeans_cluster'] + 1
    self.save_results(results[['Descriptor', 'cluster']], 'kmeans_clusters.csv')
    print(f"\nCluster sizes:")
    print(self.df['kmeans_cluster'].value_counts().sort_index())
    return self.df['kmeans_cluster']
def hierarchical_clustering(self, n_clusters=5, method='ward'):
    """Agglomerative clustering: save a dendrogram plus flat cluster labels."""
    print("\n=== Hierarchical Clustering ===")
    link_matrix = linkage(self.scaled_data, method=method)
    # Full dendrogram labelled by protocol name
    plt.figure(figsize=(16, 8))
    dendrogram(link_matrix, labels=self.df['Descriptor'].values, leaf_font_size=8)
    plt.title(f'Hierarchical Clustering Dendrogram ({method} linkage)')
    plt.xlabel('Protocol')
    plt.ylabel('Distance')
    plt.xticks(rotation=90)
    self.save_plot('hierarchical_dendrogram.png')
    # Flatten the tree into the requested number of clusters
    self.df['hierarchical_cluster'] = fcluster(link_matrix, n_clusters, criterion='maxclust')
    assignments = self.df[['Descriptor', 'hierarchical_cluster']].copy()
    assignments.columns = ['Descriptor', 'cluster']
    self.save_results(assignments, 'hierarchical_clusters.csv')
    print(f"\nCluster sizes:")
    print(self.df['hierarchical_cluster'].value_counts().sort_index())
    return self.df['hierarchical_cluster']
def dbscan_clustering(self, eps=3.0, min_samples=3):
    """DBSCAN density clustering; label -1 marks outlier protocols.

    Args:
        eps: Neighborhood radius in standardized-feature space.
        min_samples: Minimum neighbors for a core point.

    Returns:
        Per-protocol labels (also stored as df['dbscan_cluster']).
    """
    print("\n=== DBSCAN Clustering ===")
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    self.df['dbscan_cluster'] = dbscan.fit_predict(self.scaled_data)
    # BUG FIX: `-1 in series` tests the pandas *index*, not the values, so
    # the noise label was previously counted as a real cluster. Test values.
    labels = self.df['dbscan_cluster']
    n_outliers = (labels == -1).sum()
    n_clusters = labels.nunique() - (1 if n_outliers > 0 else 0)
    print(f"Found {n_clusters} clusters and {n_outliers} outliers")
    results = self.df[['Descriptor', 'dbscan_cluster']].copy()
    results.columns = ['Descriptor', 'cluster']
    self.save_results(results, 'dbscan_clusters.csv')
    if n_outliers > 0:
        outliers = self.df[self.df['dbscan_cluster'] == -1][['Descriptor']]
        self.save_results(outliers, 'dbscan_outliers.csv')
        print("\nOutlier protocols:")
        for protocol in outliers['Descriptor']:
            print(f" - {protocol}")
    return self.df['dbscan_cluster']
# ========== DIMENSIONALITY REDUCTION ==========
def pca_analysis(self, n_components=None):
    """PCA with variance plots, a 2D scatter, and a loadings heatmap.

    Args:
        n_components: Number of components to keep (None = all).

    Returns:
        (pca_coords, fitted PCA object)
    """
    print("\n=== Principal Component Analysis ===")
    pca = PCA() if n_components is None else PCA(n_components=n_components)
    pca_coords = pca.fit_transform(self.scaled_data)
    explained_var = pca.explained_variance_ratio_
    cumsum_var = np.cumsum(explained_var)
    # BUG FIX: hard-coded cumsum_var[4] raised IndexError when fewer than
    # 5 components exist (small / heavily-filtered datasets)
    n_top = min(5, len(cumsum_var))
    print(f"First {n_top} PCs explain {cumsum_var[n_top - 1]*100:.1f}% of variance")
    # Scree plot and cumulative variance curve
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
    n_show = min(15, len(explained_var))
    ax1.bar(range(1, n_show + 1), explained_var[:n_show])
    ax1.set_xlabel('Principal Component')
    ax1.set_ylabel('Explained Variance Ratio')
    ax1.set_title('Variance Explained by Each PC')
    ax1.grid(True, alpha=0.3, axis='y')
    ax2.plot(range(1, n_show + 1), cumsum_var[:n_show], 'o-')
    ax2.axhline(y=0.8, color='r', linestyle='--', alpha=0.5, label='80% threshold')
    ax2.set_xlabel('Number of Components')
    ax2.set_ylabel('Cumulative Explained Variance')
    ax2.set_title('Cumulative Variance Explained')
    ax2.legend()
    ax2.grid(True, alpha=0.3)
    self.save_plot('pca_variance.png')
    # 2D visualization -- guarded: needs at least two components
    if pca_coords.shape[1] >= 2:
        plt.figure(figsize=(12, 10))
        plt.scatter(pca_coords[:, 0], pca_coords[:, 1], alpha=0.6, s=50)
        for i, protocol in enumerate(self.df['Descriptor']):
            if i % 3 == 0:  # Label every 3rd point to avoid clutter
                plt.annotate(protocol, (pca_coords[i, 0], pca_coords[i, 1]),
                             fontsize=6, alpha=0.7)
        plt.xlabel(f'PC1 ({explained_var[0]*100:.1f}% variance)')
        plt.ylabel(f'PC2 ({explained_var[1]*100:.1f}% variance)')
        plt.title('Protocols in PCA Space (First 2 Components)')
        plt.grid(True, alpha=0.3)
        self.save_plot('pca_2d.png')
    # Save up to the first 5 PC coordinates per protocol
    n_keep = min(5, pca_coords.shape[1])
    pca_df = pd.DataFrame(pca_coords[:, :n_keep],
                          columns=[f'PC{i+1}' for i in range(n_keep)])
    pca_df.insert(0, 'Descriptor', self.df['Descriptor'])
    self.save_results(pca_df, 'pca_coordinates.csv')
    # Component loadings (dimension x PC)
    n_load = min(5, pca.components_.shape[0])
    loadings = pd.DataFrame(
        pca.components_[:n_load, :].T,
        columns=[f'PC{i+1}' for i in range(n_load)],
        index=self.dimension_cols
    )
    self.save_results(loadings, 'pca_loadings.csv')
    plt.figure(figsize=(10, 12))
    sns.heatmap(loadings, cmap='RdBu_r', center=0, cbar_kws={'label': 'Loading'})
    plt.title('PCA Component Loadings')
    plt.tight_layout()
    self.save_plot('pca_loadings_heatmap.png')
    return pca_coords, pca
def tsne_analysis(self, perplexity=30, n_components=2):
    """t-SNE projection of the standardized dimensions.

    Args:
        perplexity: Requested perplexity; clamped below n_samples because
            scikit-learn raises when perplexity >= n_samples.
        n_components: Embedding dimensionality (the plot assumes 2).

    Returns:
        The t-SNE coordinate array.
    """
    print("\n=== t-SNE Analysis ===")
    # ROBUSTNESS FIX: clamp so small datasets don't crash TSNE
    perplexity = min(perplexity, len(self.df) - 1)
    tsne = TSNE(n_components=n_components, perplexity=perplexity, random_state=42, max_iter=1000)
    tsne_coords = tsne.fit_transform(self.scaled_data)
    plt.figure(figsize=(12, 10))
    plt.scatter(tsne_coords[:, 0], tsne_coords[:, 1], alpha=0.6, s=50)
    for i, protocol in enumerate(self.df['Descriptor']):
        if i % 4 == 0:  # Label every 4th point to avoid clutter
            plt.annotate(protocol, (tsne_coords[i, 0], tsne_coords[i, 1]),
                         fontsize=6, alpha=0.7)
    plt.xlabel('t-SNE Dimension 1')
    plt.ylabel('t-SNE Dimension 2')
    plt.title(f't-SNE Projection (perplexity={perplexity})')
    plt.grid(True, alpha=0.3)
    self.save_plot('tsne_2d.png')
    tsne_df = pd.DataFrame(tsne_coords, columns=['TSNE1', 'TSNE2'])
    tsne_df.insert(0, 'Descriptor', self.df['Descriptor'])
    self.save_results(tsne_df, 'tsne_coordinates.csv')
    return tsne_coords
def umap_analysis(self, n_neighbors=15, min_dist=0.1, n_components=2):
    """UMAP projection; returns None when umap-learn is not installed."""
    print("\n=== UMAP Analysis ===")
    if not UMAP_AVAILABLE:
        print("UMAP not available. Install with: pip install umap-learn")
        return None
    embedder = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist,
                         n_components=n_components, random_state=42)
    coords = embedder.fit_transform(self.scaled_data)
    plt.figure(figsize=(12, 10))
    plt.scatter(coords[:, 0], coords[:, 1], alpha=0.6, s=50)
    # Annotate every 4th point to keep the plot legible
    for idx, protocol in enumerate(self.df['Descriptor']):
        if idx % 4 == 0:
            plt.annotate(protocol, (coords[idx, 0], coords[idx, 1]),
                         fontsize=6, alpha=0.7)
    plt.xlabel('UMAP Dimension 1')
    plt.ylabel('UMAP Dimension 2')
    plt.title(f'UMAP Projection (n_neighbors={n_neighbors}, min_dist={min_dist})')
    plt.grid(True, alpha=0.3)
    self.save_plot('umap_2d.png')
    out = pd.DataFrame(coords, columns=['UMAP1', 'UMAP2'])
    out.insert(0, 'Descriptor', self.df['Descriptor'])
    self.save_results(out, 'umap_coordinates.csv')
    return coords
def factor_analysis(self, n_factors=5):
    """Factor analysis: save loadings heatmap and per-protocol factor scores."""
    print("\n=== Factor Analysis ===")
    fa = FactorAnalysis(n_components=n_factors, random_state=42)
    fa_coords = fa.fit_transform(self.scaled_data)
    factor_names = [f'Factor{i+1}' for i in range(n_factors)]
    # Loadings: one row per dimension, one column per latent factor
    loadings = pd.DataFrame(fa.components_.T, columns=factor_names,
                            index=self.dimension_cols)
    self.save_results(loadings, 'factor_loadings.csv')
    plt.figure(figsize=(10, 12))
    sns.heatmap(loadings, cmap='RdBu_r', center=0, cbar_kws={'label': 'Loading'})
    plt.title('Factor Analysis Loadings')
    plt.tight_layout()
    self.save_plot('factor_loadings_heatmap.png')
    # Scores: one row per protocol
    scores = pd.DataFrame(fa_coords, columns=factor_names)
    scores.insert(0, 'Descriptor', self.df['Descriptor'])
    self.save_results(scores, 'factor_scores.csv')
    return fa_coords, fa
# ========== CORRELATION & STRUCTURE ==========
def correlation_analysis(self):
    """Compute and visualize correlation matrices across all dimensions.

    Saves the full matrix, the 20 strongest pairs by |r|, per-category
    heatmaps, and prints the most positive / most negative pairs.

    Returns:
        The full dimension-by-dimension correlation DataFrame.
    """
    print("\n=== Correlation Analysis ===")
    corr_matrix = self.df[self.dimension_cols].corr()
    plt.figure(figsize=(16, 14))
    sns.heatmap(corr_matrix, cmap='RdBu_r', center=0,
                square=True, linewidths=0.5, cbar_kws={'label': 'Correlation'})
    plt.title('Correlation Matrix - All Dimensions')
    plt.tight_layout()
    self.save_plot('correlation_heatmap_full.png')
    self.save_results(corr_matrix, 'correlation_matrix.csv')
    # Enumerate unique unordered dimension pairs
    corr_pairs = []
    for i in range(len(corr_matrix.columns)):
        for j in range(i + 1, len(corr_matrix.columns)):
            corr_pairs.append({
                'Dimension1': corr_matrix.columns[i],
                'Dimension2': corr_matrix.columns[j],
                'Correlation': corr_matrix.iloc[i, j]
            })
    # Strongest pairs by absolute value (for the saved CSV)
    corr_df = pd.DataFrame(corr_pairs).sort_values('Correlation',
                                                   key=abs,
                                                   ascending=False)
    self.save_results(corr_df.head(20), 'top_correlations.csv')
    # BUG FIX: the old code printed head/tail of the |r|-sorted frame, so
    # "negative" actually showed the *weakest* pairs. Sort by signed value.
    signed = corr_df.sort_values('Correlation', ascending=False)
    print("\nTop 5 positive correlations:")
    for _, row in signed.head(5).iterrows():
        print(f" {row['Dimension1']} <-> {row['Dimension2']}: {row['Correlation']:.3f}")
    print("\nTop 5 negative correlations:")
    for _, row in signed.tail(5).iloc[::-1].iterrows():
        print(f" {row['Dimension1']} <-> {row['Dimension2']}: {row['Correlation']:.3f}")
    # Within-category correlation heatmaps
    self._plot_category_correlation('Design', self.design_cols)
    self._plot_category_correlation('Entanglement', self.entanglement_cols)
    self._plot_category_correlation('Experience', self.experience_cols)
    return corr_matrix
def _plot_category_correlation(self, category_name, columns):
    """Save an annotated correlation heatmap for one dimension category."""
    matrix = self.df[columns].corr()
    plt.figure(figsize=(10, 8))
    sns.heatmap(matrix, annot=True, fmt='.2f', cmap='RdBu_r', center=0,
                square=True, linewidths=0.5, cbar_kws={'label': 'Correlation'})
    plt.title(f'{category_name} Dimensions - Correlation Matrix')
    plt.tight_layout()
    self.save_plot(f'correlation_heatmap_{category_name.lower()}.png')
def network_analysis(self, threshold=0.5):
    """Build a protocol-similarity graph and report centrality metrics."""
    print("\n=== Network Analysis ===")
    # Similarity = 1 at distance 0, 0 at the maximum pairwise distance
    dmat = squareform(pdist(self.scaled_data, metric='euclidean'))
    sim = 1 - (dmat / dmat.max())
    G = nx.Graph()
    for idx, protocol in enumerate(self.df['Descriptor']):
        G.add_node(idx, label=protocol)
    # Connect pairs whose similarity clears the threshold
    edge_count = 0
    n = len(sim)
    for a in range(n):
        for b in range(a + 1, n):
            if sim[a, b] > threshold:
                G.add_edge(a, b, weight=sim[a, b])
                edge_count += 1
    print(f"Network with {G.number_of_nodes()} nodes and {edge_count} edges")
    if G.number_of_edges() > 0:
        degree_centrality = nx.degree_centrality(G)
        betweenness = nx.betweenness_centrality(G)
        metrics_df = pd.DataFrame({
            'Descriptor': [self.df.iloc[i]['Descriptor'] for i in G.nodes()],
            'Degree_Centrality': [degree_centrality[i] for i in G.nodes()],
            'Betweenness_Centrality': [betweenness[i] for i in G.nodes()]
        }).sort_values('Degree_Centrality', ascending=False)
        self.save_results(metrics_df, 'network_metrics.csv')
        print("\nTop 5 most central protocols:")
        for _, row in metrics_df.head(5).iterrows():
            print(f" {row['Descriptor']}: {row['Degree_Centrality']:.3f}")
        # Spring layout; node size scales with degree centrality
        plt.figure(figsize=(16, 16))
        pos = nx.spring_layout(G, k=0.5, iterations=50, seed=42)
        node_sizes = [degree_centrality[i] * 3000 + 100 for i in G.nodes()]
        nx.draw_networkx_nodes(G, pos, node_size=node_sizes,
                               node_color='lightblue', alpha=0.7)
        nx.draw_networkx_edges(G, pos, alpha=0.2)
        # Only label well-connected nodes to keep the plot readable
        high_centrality = {i: self.df.iloc[i]['Descriptor']
                           for i in G.nodes() if degree_centrality[i] > 0.1}
        nx.draw_networkx_labels(G, pos, labels=high_centrality, font_size=8)
        plt.title(f'Protocol Similarity Network (threshold={threshold})')
        plt.axis('off')
        plt.tight_layout()
        self.save_plot('network_graph.png')
    else:
        print("No edges above threshold - try lowering the threshold")
    return G
# ========== CLASSIFICATION & PREDICTION ==========
def category_discriminant_analysis(self):
    """Check how well each dimension category predicts the k-means clusters.

    Returns:
        List of per-category result dicts (empty if clustering hasn't run).
    """
    print("\n=== Category Discriminant Analysis ===")
    # BUG FIX: previously `y` was only assigned when the column existed, and
    # a bare `except:` silently swallowed the resulting NameError. Bail out
    # explicitly when clustering hasn't been run yet.
    if 'kmeans_cluster' not in self.df.columns:
        print("Run clustering first to enable category discriminant analysis")
        return []
    y = self.df['kmeans_cluster'].values
    results = []
    for category_name, columns in [('Design', self.design_cols),
                                   ('Entanglement', self.entanglement_cols),
                                   ('Experience', self.experience_cols)]:
        X = self.df[columns].values
        try:
            lda = LinearDiscriminantAnalysis()
            lda.fit(X, y)
            score = lda.score(X, y)
            results.append({
                'Category': category_name,
                'Accuracy': score,
                'N_Dimensions': len(columns)
            })
            print(f"{category_name} dimensions predict clusters with {score*100:.1f}% accuracy")
        except Exception:
            # e.g. a category emptied out by --min-coverage filtering
            print(f"Could not perform LDA for {category_name}")
    if results:
        self.save_results(pd.DataFrame(results), 'category_discriminant_results.csv')
    return results
def feature_importance_analysis(self):
    """Rank dimensions by random-forest importance for the k-means labels."""
    print("\n=== Feature Importance Analysis ===")
    if 'kmeans_cluster' not in self.df.columns:
        print("Run clustering first to enable feature importance analysis")
        return None
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(self.df[self.dimension_cols].values,
               self.df['kmeans_cluster'].values)
    ranking = pd.DataFrame({
        'Dimension': self.dimension_cols,
        'Importance': forest.feature_importances_
    }).sort_values('Importance', ascending=False)
    self.save_results(ranking, 'feature_importances.csv')
    # Horizontal bars for the 20 strongest dimensions, strongest on top
    top = ranking.head(20)
    plt.figure(figsize=(10, 12))
    plt.barh(range(len(top)), top['Importance'])
    plt.yticks(range(len(top)), top['Dimension'])
    plt.xlabel('Importance')
    plt.title('Top 20 Most Important Dimensions for Clustering')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    self.save_plot('feature_importances.png')
    print("\nTop 10 most important dimensions:")
    for _, row in ranking.head(10).iterrows():
        print(f" {row['Dimension']}: {row['Importance']:.4f}")
    return ranking
def analyst_comparison(self):
    """Compare mean dimension ratings across analysts, if recorded."""
    print("\n=== Analyst Comparison ===")
    if 'analyst' not in self.df.columns:
        print("No analyst column found")
        return None
    analysts = self.df['analyst'].unique()
    print(f"Found {len(analysts)} unique analysts")
    analyst_means = self.df.groupby('analyst')[self.dimension_cols].mean()
    self.save_results(analyst_means, 'analyst_mean_ratings.csv')
    # One panel per category, analysts as series
    categories = [('Design', self.design_cols),
                  ('Entanglement', self.entanglement_cols),
                  ('Experience', self.experience_cols)]
    fig, axes = plt.subplots(3, 1, figsize=(14, 12))
    for ax, (category_name, columns) in zip(axes, categories):
        analyst_means[columns].T.plot(ax=ax, marker='o')
        ax.set_title(f'{category_name} Dimensions - Mean Ratings by Analyst')
        ax.set_ylabel('Mean Rating')
        ax.legend(title='Analyst', bbox_to_anchor=(1.05, 1), loc='upper left')
        ax.grid(True, alpha=0.3)
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
    plt.tight_layout()
    self.save_plot('analyst_comparison.png')
    return analyst_means
# ========== SUMMARY REPORT ==========
def generate_summary_report(self):
    """Write a plain-text summary of the dataset and analyses performed.

    Returns:
        The report text (also saved to reports/analysis_summary.txt).
    """
    print("\n=== Generating Summary Report ===")
    report_lines = []
    report_lines.append("=" * 80)
    report_lines.append("MULTIVARIATE ANALYSIS SUMMARY REPORT")
    report_lines.append("Protocol Bicorder Dataset")
    report_lines.append("=" * 80)
    report_lines.append("")
    report_lines.append(f"Dataset: {self.csv_path}")
    report_lines.append(f"Number of protocols: {len(self.df)}")
    report_lines.append(f"Number of dimensions: {len(self.dimension_cols)}")
    report_lines.append(f" - Design: {len(self.design_cols)}")
    report_lines.append(f" - Entanglement: {len(self.entanglement_cols)}")
    report_lines.append(f" - Experience: {len(self.experience_cols)}")
    report_lines.append("")
    report_lines.append("-" * 80)
    report_lines.append("ANALYSES PERFORMED")
    report_lines.append("-" * 80)
    # Cluster columns are only present if the corresponding analysis ran.
    # (CLEANUP: removed the dead `analyses_run` list that was appended to
    # but never read anywhere.)
    if 'kmeans_cluster' in self.df.columns:
        report_lines.append(f"K-Means: {len(self.df['kmeans_cluster'].unique())} clusters identified")
    if 'hierarchical_cluster' in self.df.columns:
        report_lines.append(f"Hierarchical: {len(self.df['hierarchical_cluster'].unique())} clusters")
    if 'dbscan_cluster' in self.df.columns:
        n_outliers = (self.df['dbscan_cluster'] == -1).sum()
        report_lines.append(f"DBSCAN: {n_outliers} outlier protocols identified")
    report_lines.append("")
    report_lines.append("Dimensionality Reduction:")
    report_lines.append("- Principal Component Analysis (PCA)")
    report_lines.append("- t-SNE Projection")
    if UMAP_AVAILABLE:
        report_lines.append("- UMAP Projection")
    report_lines.append("- Factor Analysis")
    report_lines.append("")
    report_lines.append("Statistical Analyses:")
    report_lines.append("- Correlation Analysis")
    report_lines.append("- Network Analysis")
    report_lines.append("- Feature Importance Analysis")
    if 'analyst' in self.df.columns:
        report_lines.append("- Analyst Comparison")
    report_lines.append("")
    report_lines.append("-" * 80)
    report_lines.append("OUTPUT FILES")
    report_lines.append("-" * 80)
    report_lines.append(f"All results saved to: {self.output_dir}/")
    report_lines.append(" - plots/ : All visualizations (PNG)")
    report_lines.append(" - data/ : All numerical results (CSV)")
    report_lines.append(" - reports/ : This summary report")
    report_lines.append("")
    report_lines.append("=" * 80)
    report_lines.append("END OF REPORT")
    report_lines.append("=" * 80)
    report_text = "\n".join(report_lines)
    report_path = self.output_dir / 'reports' / 'analysis_summary.txt'
    with open(report_path, 'w') as f:
        f.write(report_text)
    print(f" Saved: {report_path}")
    print("\n" + report_text)
    return report_text
def main():
    """Parse CLI arguments, run the requested analyses, and report results."""
    parser = argparse.ArgumentParser(
        description='Multivariate analysis of Protocol Bicorder data',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python3 scripts/multivariate_analysis.py data/readings/synthetic_20251116/readings.csv
  python3 scripts/multivariate_analysis.py data/readings/synthetic_20251116/readings.csv --output data/readings/synthetic_20251116/analysis
  python3 scripts/multivariate_analysis.py data/readings/synthetic_20251116/readings.csv --analyses clustering pca
"""
    )
    parser.add_argument('csv_file', help='Diagnostic readings CSV (e.g. data/readings/synthetic_20251116/readings.csv)')
    parser.add_argument('--output', '-o', default=None,
                        help='Output directory (default: <dataset_dir>/analysis)')
    # TEXT FIX: restored the garbled range "0.01.0" to "0.0-1.0" in the help
    parser.add_argument('--min-coverage', type=float, default=0.0,
                        help='Drop dimension columns below this coverage fraction (0.0-1.0). '
                             'E.g. 0.8 keeps only columns ≥80%% complete. '
                             'Useful for sparse/shortform datasets (default: 0.0, keep all)')
    parser.add_argument('--analyses', nargs='+',
                        choices=['clustering', 'pca', 'tsne', 'umap', 'factor',
                                 'correlation', 'network', 'importance', 'analyst', 'all'],
                        default=['all'],
                        help='Which analyses to run (default: all)')
    args = parser.parse_args()
    if not Path(args.csv_file).exists():
        print(f"Error: File not found: {args.csv_file}")
        sys.exit(1)
    # Default the output directory to a sibling 'analysis' dir of the dataset
    output_dir = args.output if args.output else str(Path(args.csv_file).parent / 'analysis')
    print("=" * 80)
    print("PROTOCOL BICORDER - MULTIVARIATE ANALYSIS")
    print("=" * 80)
    analyzer = ProtocolAnalyzer(args.csv_file, output_dir, min_coverage=args.min_coverage)
    selected = set(args.analyses)

    def wanted(name):
        """True when the named analysis (or 'all') was requested."""
        return 'all' in selected or name in selected

    try:
        # Clustering
        if wanted('clustering'):
            analyzer.kmeans_clustering()
            analyzer.hierarchical_clustering()
            analyzer.dbscan_clustering()
        # Dimensionality reduction
        if wanted('pca'):
            analyzer.pca_analysis()
        if wanted('tsne'):
            analyzer.tsne_analysis()
        if wanted('umap'):
            analyzer.umap_analysis()
        if wanted('factor'):
            analyzer.factor_analysis()
        # Correlation and structure
        if wanted('correlation'):
            analyzer.correlation_analysis()
        if wanted('network'):
            analyzer.network_analysis(threshold=0.6)
        # Classification
        if wanted('importance'):
            analyzer.category_discriminant_analysis()
            analyzer.feature_importance_analysis()
        if wanted('analyst'):
            analyzer.analyst_comparison()
        # Always finish with the summary report
        analyzer.generate_summary_report()
        print("\n" + "=" * 80)
        print("ANALYSIS COMPLETE!")
        print("=" * 80)
        print(f"\nAll results saved to: {analyzer.output_dir}/")
    except Exception as e:
        # Top-level boundary: report, dump the traceback, exit non-zero
        print(f"\nError during analysis: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
# Standard entry-point guard: run the CLI only when executed as a script.
if __name__ == '__main__':
    main()