- Move all scripts to scripts/, web assets to web/, analysis results into self-contained data/readings/<type>_<YYYYMMDD>/ directories - Add data/readings/manual_20260320/ with 32 JSON readings from git.medlab.host/ntnsndr/protocol-bicorder-data - Add scripts/json_to_csv.py to convert bicorder JSON files to CSV - Add scripts/sync_readings.sh for one-command sync + re-analysis of any dataset backed by a .sync_source config file - Add scripts/classify_readings.py to apply the LDA classifier to all readings and save per-reading cluster assignments - Add --min-coverage flag to multivariate_analysis.py for sparse/shortform datasets; also applies in lda_visualization.py - Fix lda_visualization.py NaN handling and 0-d array annotation bug - Update README.md and WORKFLOW.md to document datasets, sync workflow, shortform handling, and new scripts Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
859 lines
33 KiB
Python
859 lines
33 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Multivariate Analysis Script for Protocol Bicorder Data
|
||
|
||
Performs comprehensive multivariate statistical analyses on protocol diagnostic data,
|
||
including clustering, dimensionality reduction, correlation analysis, and visualization.
|
||
|
||
Usage:
|
||
python3 multivariate_analysis.py diagnostic_output.csv [--analyses all]
|
||
python3 multivariate_analysis.py diagnostic_output.csv --analyses clustering pca
|
||
"""
|
||
|
||
import argparse
|
||
import sys
|
||
from pathlib import Path
|
||
import warnings
|
||
warnings.filterwarnings('ignore')
|
||
|
||
import pandas as pd
|
||
import numpy as np
|
||
import matplotlib.pyplot as plt
|
||
import seaborn as sns
|
||
from scipy import stats
|
||
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
|
||
from scipy.spatial.distance import pdist, squareform
|
||
import networkx as nx
|
||
|
||
from sklearn.preprocessing import StandardScaler
|
||
from sklearn.decomposition import PCA, FactorAnalysis
|
||
from sklearn.manifold import TSNE
|
||
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
|
||
from sklearn.ensemble import RandomForestClassifier
|
||
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
|
||
from sklearn.metrics import silhouette_score, davies_bouldin_score
|
||
|
||
try:
|
||
import umap
|
||
UMAP_AVAILABLE = True
|
||
except ImportError:
|
||
UMAP_AVAILABLE = False
|
||
print("Note: UMAP not available. Install with: pip install umap-learn")
|
||
|
||
|
||
class ProtocolAnalyzer:
|
||
"""Main class for multivariate analysis of protocol data."""
|
||
|
||
def __init__(self, csv_path, output_dir='analysis_results', min_coverage=0.0):
|
||
"""Initialize analyzer with data and output directory.
|
||
|
||
Args:
|
||
csv_path: Path to diagnostic CSV file
|
||
output_dir: Directory for analysis output
|
||
min_coverage: Drop dimension columns with fewer than this fraction of
|
||
non-null values (0.0–1.0). Useful for sparse/shortform
|
||
datasets. E.g. 0.8 keeps only columns with ≥80% coverage.
|
||
"""
|
||
self.csv_path = Path(csv_path)
|
||
self.output_dir = Path(output_dir)
|
||
self.min_coverage = min_coverage
|
||
self.output_dir.mkdir(exist_ok=True)
|
||
|
||
# Create subdirectories
|
||
(self.output_dir / 'plots').mkdir(exist_ok=True)
|
||
(self.output_dir / 'data').mkdir(exist_ok=True)
|
||
(self.output_dir / 'reports').mkdir(exist_ok=True)
|
||
|
||
# Load and prepare data
|
||
self.df = None
|
||
self.dimension_cols = []
|
||
self.design_cols = []
|
||
self.entanglement_cols = []
|
||
self.experience_cols = []
|
||
self.scaled_data = None
|
||
self.scaler = None
|
||
|
||
self._load_data()
|
||
|
||
def _load_data(self):
|
||
"""Load CSV and identify dimension columns."""
|
||
print(f"Loading data from {self.csv_path}...")
|
||
self.df = pd.read_csv(self.csv_path)
|
||
|
||
# Identify dimension columns
|
||
all_cols = self.df.columns.tolist()
|
||
self.design_cols = [c for c in all_cols if c.startswith('Design_')]
|
||
self.entanglement_cols = [c for c in all_cols if c.startswith('Entanglement_')]
|
||
self.experience_cols = [c for c in all_cols if c.startswith('Experience_')]
|
||
self.dimension_cols = self.design_cols + self.entanglement_cols + self.experience_cols
|
||
|
||
print(f"Loaded {len(self.df)} protocols with {len(self.dimension_cols)} dimensions")
|
||
print(f" - Design: {len(self.design_cols)}")
|
||
print(f" - Entanglement: {len(self.entanglement_cols)}")
|
||
print(f" - Experience: {len(self.experience_cols)}")
|
||
|
||
# Drop low-coverage columns if min_coverage is set
|
||
if self.min_coverage > 0.0:
|
||
n = len(self.df)
|
||
coverage = self.df[self.dimension_cols].notna().sum() / n
|
||
dropped = [c for c in self.dimension_cols if coverage[c] < self.min_coverage]
|
||
if dropped:
|
||
print(f"\nDropping {len(dropped)} dimension(s) below {self.min_coverage:.0%} coverage:")
|
||
for c in dropped:
|
||
print(f" - {c}: {coverage[c]:.0%}")
|
||
self.dimension_cols = [c for c in self.dimension_cols if c not in dropped]
|
||
self.design_cols = [c for c in self.design_cols if c not in dropped]
|
||
self.entanglement_cols = [c for c in self.entanglement_cols if c not in dropped]
|
||
self.experience_cols = [c for c in self.experience_cols if c not in dropped]
|
||
print(f"Remaining dimensions: {len(self.dimension_cols)}")
|
||
|
||
# Check for missing values
|
||
missing_count = self.df[self.dimension_cols].isna().sum().sum()
|
||
rows_with_missing = self.df[self.dimension_cols].isna().any(axis=1).sum()
|
||
|
||
if missing_count > 0:
|
||
print(f"\nWarning: Found {missing_count} missing values in {rows_with_missing} rows")
|
||
print("Dropping rows with missing values...")
|
||
self.df = self.df.dropna(subset=self.dimension_cols)
|
||
print(f"Dataset now contains {len(self.df)} protocols")
|
||
|
||
# Standardize the dimension data
|
||
self.scaler = StandardScaler()
|
||
self.scaled_data = self.scaler.fit_transform(self.df[self.dimension_cols])
|
||
|
||
def save_results(self, data, filename, subdir='data'):
|
||
"""Save results to CSV file."""
|
||
output_path = self.output_dir / subdir / filename
|
||
if isinstance(data, pd.DataFrame):
|
||
data.to_csv(output_path, index=False)
|
||
else:
|
||
pd.DataFrame(data).to_csv(output_path)
|
||
print(f" Saved: {output_path}")
|
||
|
||
def save_plot(self, filename):
|
||
"""Save current matplotlib figure."""
|
||
output_path = self.output_dir / 'plots' / filename
|
||
plt.tight_layout()
|
||
plt.savefig(output_path, dpi=300, bbox_inches='tight')
|
||
print(f" Saved: {output_path}")
|
||
plt.close()
|
||
|
||
# ========== CLUSTERING ANALYSES ==========
|
||
|
||
def kmeans_clustering(self, n_clusters_range=(2, 10)):
|
||
"""Perform K-means clustering with elbow method."""
|
||
print("\n=== K-Means Clustering ===")
|
||
|
||
# Elbow method
|
||
inertias = []
|
||
silhouettes = []
|
||
k_range = range(n_clusters_range[0], n_clusters_range[1] + 1)
|
||
|
||
for k in k_range:
|
||
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
|
||
labels = kmeans.fit_predict(self.scaled_data)
|
||
inertias.append(kmeans.inertia_)
|
||
if k > 1:
|
||
silhouettes.append(silhouette_score(self.scaled_data, labels))
|
||
else:
|
||
silhouettes.append(0)
|
||
|
||
# Plot elbow curve
|
||
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
|
||
|
||
ax1.plot(k_range, inertias, 'bo-')
|
||
ax1.set_xlabel('Number of Clusters (k)')
|
||
ax1.set_ylabel('Inertia')
|
||
ax1.set_title('Elbow Method for Optimal k')
|
||
ax1.grid(True, alpha=0.3)
|
||
|
||
ax2.plot(k_range, silhouettes, 'ro-')
|
||
ax2.set_xlabel('Number of Clusters (k)')
|
||
ax2.set_ylabel('Silhouette Score')
|
||
ax2.set_title('Silhouette Score by k')
|
||
ax2.grid(True, alpha=0.3)
|
||
|
||
self.save_plot('kmeans_elbow.png')
|
||
|
||
# Use optimal k (highest silhouette)
|
||
optimal_k = k_range[np.argmax(silhouettes)]
|
||
print(f"Optimal k by silhouette score: {optimal_k}")
|
||
|
||
# Final clustering
|
||
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
|
||
self.df['kmeans_cluster'] = kmeans.fit_predict(self.scaled_data)
|
||
|
||
# Save results
|
||
results = self.df[['Descriptor', 'kmeans_cluster']].copy()
|
||
results['cluster'] = results['kmeans_cluster'] + 1 # 1-indexed for readability
|
||
self.save_results(results[['Descriptor', 'cluster']], 'kmeans_clusters.csv')
|
||
|
||
# Cluster statistics
|
||
print(f"\nCluster sizes:")
|
||
print(self.df['kmeans_cluster'].value_counts().sort_index())
|
||
|
||
return self.df['kmeans_cluster']
|
||
|
||
def hierarchical_clustering(self, n_clusters=5, method='ward'):
|
||
"""Perform hierarchical clustering with dendrogram."""
|
||
print("\n=== Hierarchical Clustering ===")
|
||
|
||
# Compute linkage
|
||
Z = linkage(self.scaled_data, method=method)
|
||
|
||
# Plot dendrogram
|
||
plt.figure(figsize=(16, 8))
|
||
dendrogram(Z, labels=self.df['Descriptor'].values, leaf_font_size=8)
|
||
plt.title(f'Hierarchical Clustering Dendrogram ({method} linkage)')
|
||
plt.xlabel('Protocol')
|
||
plt.ylabel('Distance')
|
||
plt.xticks(rotation=90)
|
||
self.save_plot('hierarchical_dendrogram.png')
|
||
|
||
# Cut tree to get clusters
|
||
self.df['hierarchical_cluster'] = fcluster(Z, n_clusters, criterion='maxclust')
|
||
|
||
# Save results
|
||
results = self.df[['Descriptor', 'hierarchical_cluster']].copy()
|
||
results.columns = ['Descriptor', 'cluster']
|
||
self.save_results(results, 'hierarchical_clusters.csv')
|
||
|
||
print(f"\nCluster sizes:")
|
||
print(self.df['hierarchical_cluster'].value_counts().sort_index())
|
||
|
||
return self.df['hierarchical_cluster']
|
||
|
||
def dbscan_clustering(self, eps=3.0, min_samples=3):
|
||
"""Perform DBSCAN clustering to identify outliers."""
|
||
print("\n=== DBSCAN Clustering ===")
|
||
|
||
dbscan = DBSCAN(eps=eps, min_samples=min_samples)
|
||
self.df['dbscan_cluster'] = dbscan.fit_predict(self.scaled_data)
|
||
|
||
n_clusters = len(set(self.df['dbscan_cluster'])) - (1 if -1 in self.df['dbscan_cluster'] else 0)
|
||
n_outliers = (self.df['dbscan_cluster'] == -1).sum()
|
||
|
||
print(f"Found {n_clusters} clusters and {n_outliers} outliers")
|
||
|
||
# Save results
|
||
results = self.df[['Descriptor', 'dbscan_cluster']].copy()
|
||
results.columns = ['Descriptor', 'cluster']
|
||
self.save_results(results, 'dbscan_clusters.csv')
|
||
|
||
if n_outliers > 0:
|
||
outliers = self.df[self.df['dbscan_cluster'] == -1][['Descriptor']]
|
||
self.save_results(outliers, 'dbscan_outliers.csv')
|
||
print("\nOutlier protocols:")
|
||
for protocol in outliers['Descriptor']:
|
||
print(f" - {protocol}")
|
||
|
||
return self.df['dbscan_cluster']
|
||
|
||
# ========== DIMENSIONALITY REDUCTION ==========
|
||
|
||
def pca_analysis(self, n_components=None):
|
||
"""Perform PCA and visualize results."""
|
||
print("\n=== Principal Component Analysis ===")
|
||
|
||
# Fit PCA
|
||
if n_components is None:
|
||
pca = PCA()
|
||
else:
|
||
pca = PCA(n_components=n_components)
|
||
|
||
pca_coords = pca.fit_transform(self.scaled_data)
|
||
|
||
# Explained variance
|
||
explained_var = pca.explained_variance_ratio_
|
||
cumsum_var = np.cumsum(explained_var)
|
||
|
||
print(f"First 5 PCs explain {cumsum_var[4]*100:.1f}% of variance")
|
||
|
||
# Plot explained variance
|
||
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
|
||
|
||
n_show = min(15, len(explained_var))
|
||
ax1.bar(range(1, n_show + 1), explained_var[:n_show])
|
||
ax1.set_xlabel('Principal Component')
|
||
ax1.set_ylabel('Explained Variance Ratio')
|
||
ax1.set_title('Variance Explained by Each PC')
|
||
ax1.grid(True, alpha=0.3, axis='y')
|
||
|
||
ax2.plot(range(1, n_show + 1), cumsum_var[:n_show], 'o-')
|
||
ax2.axhline(y=0.8, color='r', linestyle='--', alpha=0.5, label='80% threshold')
|
||
ax2.set_xlabel('Number of Components')
|
||
ax2.set_ylabel('Cumulative Explained Variance')
|
||
ax2.set_title('Cumulative Variance Explained')
|
||
ax2.legend()
|
||
ax2.grid(True, alpha=0.3)
|
||
|
||
self.save_plot('pca_variance.png')
|
||
|
||
# 2D visualization
|
||
plt.figure(figsize=(12, 10))
|
||
plt.scatter(pca_coords[:, 0], pca_coords[:, 1], alpha=0.6, s=50)
|
||
|
||
# Annotate points
|
||
for i, protocol in enumerate(self.df['Descriptor']):
|
||
if i % 3 == 0: # Label every 3rd point to avoid clutter
|
||
plt.annotate(protocol, (pca_coords[i, 0], pca_coords[i, 1]),
|
||
fontsize=6, alpha=0.7)
|
||
|
||
plt.xlabel(f'PC1 ({explained_var[0]*100:.1f}% variance)')
|
||
plt.ylabel(f'PC2 ({explained_var[1]*100:.1f}% variance)')
|
||
plt.title('Protocols in PCA Space (First 2 Components)')
|
||
plt.grid(True, alpha=0.3)
|
||
self.save_plot('pca_2d.png')
|
||
|
||
# Save PCA coordinates
|
||
pca_df = pd.DataFrame(pca_coords[:, :5],
|
||
columns=[f'PC{i+1}' for i in range(min(5, pca_coords.shape[1]))])
|
||
pca_df.insert(0, 'Descriptor', self.df['Descriptor'])
|
||
self.save_results(pca_df, 'pca_coordinates.csv')
|
||
|
||
# Component loadings
|
||
loadings = pd.DataFrame(
|
||
pca.components_[:5, :].T,
|
||
columns=[f'PC{i+1}' for i in range(min(5, pca.components_.shape[0]))],
|
||
index=self.dimension_cols
|
||
)
|
||
self.save_results(loadings, 'pca_loadings.csv')
|
||
|
||
# Plot loadings heatmap
|
||
plt.figure(figsize=(10, 12))
|
||
sns.heatmap(loadings, cmap='RdBu_r', center=0, cbar_kws={'label': 'Loading'})
|
||
plt.title('PCA Component Loadings')
|
||
plt.tight_layout()
|
||
self.save_plot('pca_loadings_heatmap.png')
|
||
|
||
return pca_coords, pca
|
||
|
||
def tsne_analysis(self, perplexity=30, n_components=2):
|
||
"""Perform t-SNE analysis."""
|
||
print("\n=== t-SNE Analysis ===")
|
||
|
||
tsne = TSNE(n_components=n_components, perplexity=perplexity, random_state=42, max_iter=1000)
|
||
tsne_coords = tsne.fit_transform(self.scaled_data)
|
||
|
||
# Plot
|
||
plt.figure(figsize=(12, 10))
|
||
plt.scatter(tsne_coords[:, 0], tsne_coords[:, 1], alpha=0.6, s=50)
|
||
|
||
# Annotate some points
|
||
for i, protocol in enumerate(self.df['Descriptor']):
|
||
if i % 4 == 0: # Label every 4th point
|
||
plt.annotate(protocol, (tsne_coords[i, 0], tsne_coords[i, 1]),
|
||
fontsize=6, alpha=0.7)
|
||
|
||
plt.xlabel('t-SNE Dimension 1')
|
||
plt.ylabel('t-SNE Dimension 2')
|
||
plt.title(f't-SNE Projection (perplexity={perplexity})')
|
||
plt.grid(True, alpha=0.3)
|
||
self.save_plot('tsne_2d.png')
|
||
|
||
# Save coordinates
|
||
tsne_df = pd.DataFrame(tsne_coords, columns=['TSNE1', 'TSNE2'])
|
||
tsne_df.insert(0, 'Descriptor', self.df['Descriptor'])
|
||
self.save_results(tsne_df, 'tsne_coordinates.csv')
|
||
|
||
return tsne_coords
|
||
|
||
def umap_analysis(self, n_neighbors=15, min_dist=0.1, n_components=2):
|
||
"""Perform UMAP analysis if available."""
|
||
if not UMAP_AVAILABLE:
|
||
print("\n=== UMAP Analysis ===")
|
||
print("UMAP not available. Install with: pip install umap-learn")
|
||
return None
|
||
|
||
print("\n=== UMAP Analysis ===")
|
||
|
||
reducer = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist,
|
||
n_components=n_components, random_state=42)
|
||
umap_coords = reducer.fit_transform(self.scaled_data)
|
||
|
||
# Plot
|
||
plt.figure(figsize=(12, 10))
|
||
plt.scatter(umap_coords[:, 0], umap_coords[:, 1], alpha=0.6, s=50)
|
||
|
||
# Annotate some points
|
||
for i, protocol in enumerate(self.df['Descriptor']):
|
||
if i % 4 == 0:
|
||
plt.annotate(protocol, (umap_coords[i, 0], umap_coords[i, 1]),
|
||
fontsize=6, alpha=0.7)
|
||
|
||
plt.xlabel('UMAP Dimension 1')
|
||
plt.ylabel('UMAP Dimension 2')
|
||
plt.title(f'UMAP Projection (n_neighbors={n_neighbors}, min_dist={min_dist})')
|
||
plt.grid(True, alpha=0.3)
|
||
self.save_plot('umap_2d.png')
|
||
|
||
# Save coordinates
|
||
umap_df = pd.DataFrame(umap_coords, columns=['UMAP1', 'UMAP2'])
|
||
umap_df.insert(0, 'Descriptor', self.df['Descriptor'])
|
||
self.save_results(umap_df, 'umap_coordinates.csv')
|
||
|
||
return umap_coords
|
||
|
||
def factor_analysis(self, n_factors=5):
|
||
"""Perform factor analysis."""
|
||
print("\n=== Factor Analysis ===")
|
||
|
||
fa = FactorAnalysis(n_components=n_factors, random_state=42)
|
||
fa_coords = fa.fit_transform(self.scaled_data)
|
||
|
||
# Factor loadings
|
||
loadings = pd.DataFrame(
|
||
fa.components_.T,
|
||
columns=[f'Factor{i+1}' for i in range(n_factors)],
|
||
index=self.dimension_cols
|
||
)
|
||
self.save_results(loadings, 'factor_loadings.csv')
|
||
|
||
# Plot loadings heatmap
|
||
plt.figure(figsize=(10, 12))
|
||
sns.heatmap(loadings, cmap='RdBu_r', center=0, cbar_kws={'label': 'Loading'})
|
||
plt.title('Factor Analysis Loadings')
|
||
plt.tight_layout()
|
||
self.save_plot('factor_loadings_heatmap.png')
|
||
|
||
# Save factor scores
|
||
fa_df = pd.DataFrame(fa_coords,
|
||
columns=[f'Factor{i+1}' for i in range(n_factors)])
|
||
fa_df.insert(0, 'Descriptor', self.df['Descriptor'])
|
||
self.save_results(fa_df, 'factor_scores.csv')
|
||
|
||
return fa_coords, fa
|
||
|
||
# ========== CORRELATION & STRUCTURE ==========
|
||
|
||
def correlation_analysis(self):
|
||
"""Compute and visualize correlation matrices."""
|
||
print("\n=== Correlation Analysis ===")
|
||
|
||
# Full correlation matrix
|
||
corr_matrix = self.df[self.dimension_cols].corr()
|
||
|
||
# Plot full correlation heatmap
|
||
plt.figure(figsize=(16, 14))
|
||
sns.heatmap(corr_matrix, cmap='RdBu_r', center=0,
|
||
square=True, linewidths=0.5, cbar_kws={'label': 'Correlation'})
|
||
plt.title('Correlation Matrix - All Dimensions')
|
||
plt.tight_layout()
|
||
self.save_plot('correlation_heatmap_full.png')
|
||
|
||
# Save correlation matrix
|
||
self.save_results(corr_matrix, 'correlation_matrix.csv')
|
||
|
||
# Find strongest correlations
|
||
corr_pairs = []
|
||
for i in range(len(corr_matrix.columns)):
|
||
for j in range(i+1, len(corr_matrix.columns)):
|
||
corr_pairs.append({
|
||
'Dimension1': corr_matrix.columns[i],
|
||
'Dimension2': corr_matrix.columns[j],
|
||
'Correlation': corr_matrix.iloc[i, j]
|
||
})
|
||
|
||
corr_df = pd.DataFrame(corr_pairs).sort_values('Correlation',
|
||
key=abs,
|
||
ascending=False)
|
||
self.save_results(corr_df.head(20), 'top_correlations.csv')
|
||
|
||
print("\nTop 5 positive correlations:")
|
||
for _, row in corr_df.head(5).iterrows():
|
||
print(f" {row['Dimension1']} <-> {row['Dimension2']}: {row['Correlation']:.3f}")
|
||
|
||
print("\nTop 5 negative correlations:")
|
||
for _, row in corr_df.tail(5).iterrows():
|
||
print(f" {row['Dimension1']} <-> {row['Dimension2']}: {row['Correlation']:.3f}")
|
||
|
||
# Within-category correlations
|
||
self._plot_category_correlation('Design', self.design_cols)
|
||
self._plot_category_correlation('Entanglement', self.entanglement_cols)
|
||
self._plot_category_correlation('Experience', self.experience_cols)
|
||
|
||
return corr_matrix
|
||
|
||
def _plot_category_correlation(self, category_name, columns):
|
||
"""Plot correlation heatmap for a specific category."""
|
||
corr = self.df[columns].corr()
|
||
|
||
plt.figure(figsize=(10, 8))
|
||
sns.heatmap(corr, annot=True, fmt='.2f', cmap='RdBu_r', center=0,
|
||
square=True, linewidths=0.5, cbar_kws={'label': 'Correlation'})
|
||
plt.title(f'{category_name} Dimensions - Correlation Matrix')
|
||
plt.tight_layout()
|
||
self.save_plot(f'correlation_heatmap_{category_name.lower()}.png')
|
||
|
||
def network_analysis(self, threshold=0.5):
|
||
"""Create network graph of protocol similarities."""
|
||
print("\n=== Network Analysis ===")
|
||
|
||
# Compute pairwise distances
|
||
distances = pdist(self.scaled_data, metric='euclidean')
|
||
dist_matrix = squareform(distances)
|
||
|
||
# Convert to similarity (inverse of distance, normalized)
|
||
max_dist = dist_matrix.max()
|
||
similarity_matrix = 1 - (dist_matrix / max_dist)
|
||
|
||
# Create network
|
||
G = nx.Graph()
|
||
|
||
# Add nodes
|
||
for i, protocol in enumerate(self.df['Descriptor']):
|
||
G.add_node(i, label=protocol)
|
||
|
||
# Add edges above threshold
|
||
edge_count = 0
|
||
for i in range(len(similarity_matrix)):
|
||
for j in range(i+1, len(similarity_matrix)):
|
||
if similarity_matrix[i, j] > threshold:
|
||
G.add_edge(i, j, weight=similarity_matrix[i, j])
|
||
edge_count += 1
|
||
|
||
print(f"Network with {G.number_of_nodes()} nodes and {edge_count} edges")
|
||
|
||
# Calculate network metrics
|
||
if G.number_of_edges() > 0:
|
||
degree_centrality = nx.degree_centrality(G)
|
||
betweenness = nx.betweenness_centrality(G)
|
||
|
||
metrics_df = pd.DataFrame({
|
||
'Descriptor': [self.df.iloc[i]['Descriptor'] for i in G.nodes()],
|
||
'Degree_Centrality': [degree_centrality[i] for i in G.nodes()],
|
||
'Betweenness_Centrality': [betweenness[i] for i in G.nodes()]
|
||
}).sort_values('Degree_Centrality', ascending=False)
|
||
|
||
self.save_results(metrics_df, 'network_metrics.csv')
|
||
|
||
print("\nTop 5 most central protocols:")
|
||
for _, row in metrics_df.head(5).iterrows():
|
||
print(f" {row['Descriptor']}: {row['Degree_Centrality']:.3f}")
|
||
|
||
# Plot network
|
||
plt.figure(figsize=(16, 16))
|
||
pos = nx.spring_layout(G, k=0.5, iterations=50, seed=42)
|
||
|
||
# Node sizes based on degree centrality
|
||
node_sizes = [degree_centrality[i] * 3000 + 100 for i in G.nodes()]
|
||
|
||
nx.draw_networkx_nodes(G, pos, node_size=node_sizes,
|
||
node_color='lightblue', alpha=0.7)
|
||
nx.draw_networkx_edges(G, pos, alpha=0.2)
|
||
|
||
# Labels for high-centrality nodes
|
||
high_centrality = {i: self.df.iloc[i]['Descriptor']
|
||
for i in G.nodes() if degree_centrality[i] > 0.1}
|
||
nx.draw_networkx_labels(G, pos, labels=high_centrality, font_size=8)
|
||
|
||
plt.title(f'Protocol Similarity Network (threshold={threshold})')
|
||
plt.axis('off')
|
||
plt.tight_layout()
|
||
self.save_plot('network_graph.png')
|
||
else:
|
||
print("No edges above threshold - try lowering the threshold")
|
||
|
||
return G
|
||
|
||
# ========== CLASSIFICATION & PREDICTION ==========
|
||
|
||
def category_discriminant_analysis(self):
|
||
"""Analyze how well dimension categories discriminate protocols."""
|
||
print("\n=== Category Discriminant Analysis ===")
|
||
|
||
results = []
|
||
|
||
for category_name, columns in [('Design', self.design_cols),
|
||
('Entanglement', self.entanglement_cols),
|
||
('Experience', self.experience_cols)]:
|
||
|
||
# Use one category to predict clustering from another
|
||
X = self.df[columns].values
|
||
|
||
# Use kmeans clusters as target if available
|
||
if 'kmeans_cluster' in self.df.columns:
|
||
y = self.df['kmeans_cluster'].values
|
||
|
||
# LDA
|
||
try:
|
||
lda = LinearDiscriminantAnalysis()
|
||
lda.fit(X, y)
|
||
score = lda.score(X, y)
|
||
|
||
results.append({
|
||
'Category': category_name,
|
||
'Accuracy': score,
|
||
'N_Dimensions': len(columns)
|
||
})
|
||
|
||
print(f"{category_name} dimensions predict clusters with {score*100:.1f}% accuracy")
|
||
except:
|
||
print(f"Could not perform LDA for {category_name}")
|
||
|
||
if results:
|
||
results_df = pd.DataFrame(results)
|
||
self.save_results(results_df, 'category_discriminant_results.csv')
|
||
|
||
return results
|
||
|
||
def feature_importance_analysis(self):
|
||
"""Analyze which dimensions are most important for clustering."""
|
||
print("\n=== Feature Importance Analysis ===")
|
||
|
||
if 'kmeans_cluster' not in self.df.columns:
|
||
print("Run clustering first to enable feature importance analysis")
|
||
return None
|
||
|
||
# Random Forest classifier
|
||
X = self.df[self.dimension_cols].values
|
||
y = self.df['kmeans_cluster'].values
|
||
|
||
rf = RandomForestClassifier(n_estimators=100, random_state=42)
|
||
rf.fit(X, y)
|
||
|
||
# Feature importances
|
||
importances = pd.DataFrame({
|
||
'Dimension': self.dimension_cols,
|
||
'Importance': rf.feature_importances_
|
||
}).sort_values('Importance', ascending=False)
|
||
|
||
self.save_results(importances, 'feature_importances.csv')
|
||
|
||
# Plot top 20
|
||
plt.figure(figsize=(10, 12))
|
||
top_20 = importances.head(20)
|
||
plt.barh(range(len(top_20)), top_20['Importance'])
|
||
plt.yticks(range(len(top_20)), top_20['Dimension'])
|
||
plt.xlabel('Importance')
|
||
plt.title('Top 20 Most Important Dimensions for Clustering')
|
||
plt.gca().invert_yaxis()
|
||
plt.tight_layout()
|
||
self.save_plot('feature_importances.png')
|
||
|
||
print("\nTop 10 most important dimensions:")
|
||
for _, row in importances.head(10).iterrows():
|
||
print(f" {row['Dimension']}: {row['Importance']:.4f}")
|
||
|
||
return importances
|
||
|
||
def analyst_comparison(self):
|
||
"""Compare ratings across different analysts."""
|
||
print("\n=== Analyst Comparison ===")
|
||
|
||
if 'analyst' not in self.df.columns:
|
||
print("No analyst column found")
|
||
return None
|
||
|
||
analysts = self.df['analyst'].unique()
|
||
print(f"Found {len(analysts)} unique analysts")
|
||
|
||
# Mean ratings by analyst for each dimension
|
||
analyst_means = self.df.groupby('analyst')[self.dimension_cols].mean()
|
||
self.save_results(analyst_means, 'analyst_mean_ratings.csv')
|
||
|
||
# Plot comparison
|
||
fig, axes = plt.subplots(3, 1, figsize=(14, 12))
|
||
|
||
for idx, (category_name, columns) in enumerate([
|
||
('Design', self.design_cols),
|
||
('Entanglement', self.entanglement_cols),
|
||
('Experience', self.experience_cols)
|
||
]):
|
||
analyst_means[columns].T.plot(ax=axes[idx], marker='o')
|
||
axes[idx].set_title(f'{category_name} Dimensions - Mean Ratings by Analyst')
|
||
axes[idx].set_ylabel('Mean Rating')
|
||
axes[idx].legend(title='Analyst', bbox_to_anchor=(1.05, 1), loc='upper left')
|
||
axes[idx].grid(True, alpha=0.3)
|
||
axes[idx].set_xticklabels(axes[idx].get_xticklabels(), rotation=45, ha='right')
|
||
|
||
plt.tight_layout()
|
||
self.save_plot('analyst_comparison.png')
|
||
|
||
return analyst_means
|
||
|
||
# ========== SUMMARY REPORT ==========
|
||
|
||
def generate_summary_report(self):
|
||
"""Generate a text summary of all analyses."""
|
||
print("\n=== Generating Summary Report ===")
|
||
|
||
report_lines = []
|
||
report_lines.append("=" * 80)
|
||
report_lines.append("MULTIVARIATE ANALYSIS SUMMARY REPORT")
|
||
report_lines.append("Protocol Bicorder Dataset")
|
||
report_lines.append("=" * 80)
|
||
report_lines.append("")
|
||
|
||
report_lines.append(f"Dataset: {self.csv_path}")
|
||
report_lines.append(f"Number of protocols: {len(self.df)}")
|
||
report_lines.append(f"Number of dimensions: {len(self.dimension_cols)}")
|
||
report_lines.append(f" - Design: {len(self.design_cols)}")
|
||
report_lines.append(f" - Entanglement: {len(self.entanglement_cols)}")
|
||
report_lines.append(f" - Experience: {len(self.experience_cols)}")
|
||
report_lines.append("")
|
||
|
||
report_lines.append("-" * 80)
|
||
report_lines.append("ANALYSES PERFORMED")
|
||
report_lines.append("-" * 80)
|
||
|
||
# Check which analyses were run
|
||
analyses_run = []
|
||
|
||
if 'kmeans_cluster' in self.df.columns:
|
||
analyses_run.append("- K-Means Clustering")
|
||
report_lines.append(f"K-Means: {len(self.df['kmeans_cluster'].unique())} clusters identified")
|
||
|
||
if 'hierarchical_cluster' in self.df.columns:
|
||
analyses_run.append("- Hierarchical Clustering")
|
||
report_lines.append(f"Hierarchical: {len(self.df['hierarchical_cluster'].unique())} clusters")
|
||
|
||
if 'dbscan_cluster' in self.df.columns:
|
||
analyses_run.append("- DBSCAN Clustering")
|
||
n_outliers = (self.df['dbscan_cluster'] == -1).sum()
|
||
report_lines.append(f"DBSCAN: {n_outliers} outlier protocols identified")
|
||
|
||
report_lines.append("")
|
||
report_lines.append("Dimensionality Reduction:")
|
||
report_lines.append("- Principal Component Analysis (PCA)")
|
||
report_lines.append("- t-SNE Projection")
|
||
if UMAP_AVAILABLE:
|
||
report_lines.append("- UMAP Projection")
|
||
report_lines.append("- Factor Analysis")
|
||
report_lines.append("")
|
||
|
||
report_lines.append("Statistical Analyses:")
|
||
report_lines.append("- Correlation Analysis")
|
||
report_lines.append("- Network Analysis")
|
||
report_lines.append("- Feature Importance Analysis")
|
||
|
||
if 'analyst' in self.df.columns:
|
||
report_lines.append("- Analyst Comparison")
|
||
|
||
report_lines.append("")
|
||
report_lines.append("-" * 80)
|
||
report_lines.append("OUTPUT FILES")
|
||
report_lines.append("-" * 80)
|
||
report_lines.append(f"All results saved to: {self.output_dir}/")
|
||
report_lines.append(" - plots/ : All visualizations (PNG)")
|
||
report_lines.append(" - data/ : All numerical results (CSV)")
|
||
report_lines.append(" - reports/ : This summary report")
|
||
report_lines.append("")
|
||
|
||
report_lines.append("=" * 80)
|
||
report_lines.append("END OF REPORT")
|
||
report_lines.append("=" * 80)
|
||
|
||
report_text = "\n".join(report_lines)
|
||
|
||
# Save report
|
||
report_path = self.output_dir / 'reports' / 'analysis_summary.txt'
|
||
with open(report_path, 'w') as f:
|
||
f.write(report_text)
|
||
|
||
print(f" Saved: {report_path}")
|
||
print("\n" + report_text)
|
||
|
||
return report_text
|
||
|
||
|
||
def main():
|
||
"""Main execution function."""
|
||
parser = argparse.ArgumentParser(
|
||
description='Multivariate analysis of Protocol Bicorder data',
|
||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||
epilog="""
|
||
Examples:
|
||
python3 scripts/multivariate_analysis.py data/readings/synthetic_20251116/readings.csv
|
||
python3 scripts/multivariate_analysis.py data/readings/synthetic_20251116/readings.csv --output data/readings/synthetic_20251116/analysis
|
||
python3 scripts/multivariate_analysis.py data/readings/synthetic_20251116/readings.csv --analyses clustering pca
|
||
"""
|
||
)
|
||
|
||
parser.add_argument('csv_file', help='Diagnostic readings CSV (e.g. data/readings/synthetic_20251116/readings.csv)')
|
||
parser.add_argument('--output', '-o', default=None,
|
||
help='Output directory (default: <dataset_dir>/analysis)')
|
||
parser.add_argument('--min-coverage', type=float, default=0.0,
|
||
help='Drop dimension columns below this coverage fraction (0.0–1.0). '
|
||
'E.g. 0.8 keeps only columns ≥80%% complete. '
|
||
'Useful for sparse/shortform datasets (default: 0.0, keep all)')
|
||
parser.add_argument('--analyses', nargs='+',
|
||
choices=['clustering', 'pca', 'tsne', 'umap', 'factor',
|
||
'correlation', 'network', 'importance', 'analyst', 'all'],
|
||
default=['all'],
|
||
help='Which analyses to run (default: all)')
|
||
|
||
args = parser.parse_args()
|
||
|
||
# Check if file exists
|
||
if not Path(args.csv_file).exists():
|
||
print(f"Error: File not found: {args.csv_file}")
|
||
sys.exit(1)
|
||
|
||
# Derive output dir from dataset dir if not specified
|
||
output_dir = args.output if args.output else str(Path(args.csv_file).parent / 'analysis')
|
||
|
||
# Initialize analyzer
|
||
print("=" * 80)
|
||
print("PROTOCOL BICORDER - MULTIVARIATE ANALYSIS")
|
||
print("=" * 80)
|
||
|
||
analyzer = ProtocolAnalyzer(args.csv_file, output_dir, min_coverage=args.min_coverage)
|
||
|
||
# Determine which analyses to run
|
||
run_all = 'all' in args.analyses
|
||
|
||
# Run analyses
|
||
try:
|
||
# Clustering
|
||
if run_all or 'clustering' in args.analyses:
|
||
analyzer.kmeans_clustering()
|
||
analyzer.hierarchical_clustering()
|
||
analyzer.dbscan_clustering()
|
||
|
||
# Dimensionality reduction
|
||
if run_all or 'pca' in args.analyses:
|
||
analyzer.pca_analysis()
|
||
|
||
if run_all or 'tsne' in args.analyses:
|
||
analyzer.tsne_analysis()
|
||
|
||
if run_all or 'umap' in args.analyses:
|
||
analyzer.umap_analysis()
|
||
|
||
if run_all or 'factor' in args.analyses:
|
||
analyzer.factor_analysis()
|
||
|
||
# Correlation and structure
|
||
if run_all or 'correlation' in args.analyses:
|
||
analyzer.correlation_analysis()
|
||
|
||
if run_all or 'network' in args.analyses:
|
||
analyzer.network_analysis(threshold=0.6)
|
||
|
||
# Classification
|
||
if run_all or 'importance' in args.analyses:
|
||
analyzer.category_discriminant_analysis()
|
||
analyzer.feature_importance_analysis()
|
||
|
||
if run_all or 'analyst' in args.analyses:
|
||
analyzer.analyst_comparison()
|
||
|
||
# Generate summary
|
||
analyzer.generate_summary_report()
|
||
|
||
print("\n" + "=" * 80)
|
||
print("ANALYSIS COMPLETE!")
|
||
print("=" * 80)
|
||
print(f"\nAll results saved to: {analyzer.output_dir}/")
|
||
|
||
except Exception as e:
|
||
print(f"\nError during analysis: {e}")
|
||
import traceback
|
||
traceback.print_exc()
|
||
sys.exit(1)
|
||
|
||
|
||
if __name__ == '__main__':
|
||
main()
|