Reorganize directory, add manual dataset and sync tooling

- Move all scripts to scripts/, web assets to web/, analysis results
  into self-contained data/readings/<type>_<YYYYMMDD>/ directories
- Add data/readings/manual_20260320/ with 32 JSON readings from
  git.medlab.host/ntnsndr/protocol-bicorder-data
- Add scripts/json_to_csv.py to convert bicorder JSON files to CSV
- Add scripts/sync_readings.sh for one-command sync + re-analysis of
  any dataset backed by a .sync_source config file
- Add scripts/classify_readings.py to apply the LDA classifier to all
  readings and save per-reading cluster assignments
- Add --min-coverage flag to multivariate_analysis.py for sparse/shortform
  datasets; also applies in lda_visualization.py
- Fix lda_visualization.py NaN handling and 0-d array annotation bug
- Update README.md and WORKFLOW.md to document datasets, sync workflow,
  shortform handling, and new scripts

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Nathan Schneider
2026-03-20 17:35:13 -06:00
parent 0c794dddae
commit 897c30406b
545 changed files with 10715 additions and 718 deletions

View File

@@ -0,0 +1,858 @@
#!/usr/bin/env python3
"""
Multivariate Analysis Script for Protocol Bicorder Data
Performs comprehensive multivariate statistical analyses on protocol diagnostic data,
including clustering, dimensionality reduction, correlation analysis, and visualization.
Usage:
python3 multivariate_analysis.py diagnostic_output.csv [--analyses all]
python3 multivariate_analysis.py diagnostic_output.csv --analyses clustering pca
"""
import argparse
import sys
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from scipy.spatial.distance import pdist, squareform
import networkx as nx
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import silhouette_score, davies_bouldin_score
try:
import umap
UMAP_AVAILABLE = True
except ImportError:
UMAP_AVAILABLE = False
print("Note: UMAP not available. Install with: pip install umap-learn")
class ProtocolAnalyzer:
"""Main class for multivariate analysis of protocol data."""
def __init__(self, csv_path, output_dir='analysis_results', min_coverage=0.0):
    """Initialize analyzer with data and output directory.

    Args:
        csv_path: Path to diagnostic CSV file
        output_dir: Directory for analysis output
        min_coverage: Drop dimension columns with fewer than this fraction of
            non-null values (0.0-1.0). Useful for sparse/shortform
            datasets. E.g. 0.8 keeps only columns with >=80% coverage.
    """
    self.csv_path = Path(csv_path)
    self.output_dir = Path(output_dir)
    self.min_coverage = min_coverage
    # parents=True so the default '<dataset_dir>/analysis' works even when
    # intermediate directories do not exist yet (plain exist_ok crashed then).
    self.output_dir.mkdir(parents=True, exist_ok=True)
    # Create subdirectories for each kind of output artifact
    (self.output_dir / 'plots').mkdir(exist_ok=True)
    (self.output_dir / 'data').mkdir(exist_ok=True)
    (self.output_dir / 'reports').mkdir(exist_ok=True)
    # Attributes populated by _load_data()
    self.df = None
    self.dimension_cols = []
    self.design_cols = []
    self.entanglement_cols = []
    self.experience_cols = []
    self.scaled_data = None
    self.scaler = None
    self._load_data()
def _load_data(self):
    """Read the CSV and bucket its dimension columns by category prefix."""
    print(f"Loading data from {self.csv_path}...")
    self.df = pd.read_csv(self.csv_path)
    # Dimension columns are recognized purely by their name prefix
    columns = list(self.df.columns)
    self.design_cols = [col for col in columns if col.startswith('Design_')]
    self.entanglement_cols = [col for col in columns if col.startswith('Entanglement_')]
    self.experience_cols = [col for col in columns if col.startswith('Experience_')]
    self.dimension_cols = self.design_cols + self.entanglement_cols + self.experience_cols
    print(f"Loaded {len(self.df)} protocols with {len(self.dimension_cols)} dimensions")
    print(f" - Design: {len(self.design_cols)}")
    print(f" - Entanglement: {len(self.entanglement_cols)}")
    print(f" - Experience: {len(self.experience_cols)}")
    # Optionally prune sparse dimensions before any row filtering
    if self.min_coverage > 0.0:
        coverage = self.df[self.dimension_cols].notna().sum() / len(self.df)
        dropped = [col for col in self.dimension_cols if coverage[col] < self.min_coverage]
        if dropped:
            print(f"\nDropping {len(dropped)} dimension(s) below {self.min_coverage:.0%} coverage:")
            for col in dropped:
                print(f" - {col}: {coverage[col]:.0%}")
            removed = set(dropped)
            self.dimension_cols = [c for c in self.dimension_cols if c not in removed]
            self.design_cols = [c for c in self.design_cols if c not in removed]
            self.entanglement_cols = [c for c in self.entanglement_cols if c not in removed]
            self.experience_cols = [c for c in self.experience_cols if c not in removed]
            print(f"Remaining dimensions: {len(self.dimension_cols)}")
    # Rows with any remaining missing dimension value are removed entirely
    dims = self.df[self.dimension_cols]
    missing_count = dims.isna().sum().sum()
    rows_with_missing = dims.isna().any(axis=1).sum()
    if missing_count > 0:
        print(f"\nWarning: Found {missing_count} missing values in {rows_with_missing} rows")
        print("Dropping rows with missing values...")
        self.df = self.df.dropna(subset=self.dimension_cols)
        print(f"Dataset now contains {len(self.df)} protocols")
    # Z-score the dimension data for the distance-based analyses downstream
    self.scaler = StandardScaler()
    self.scaled_data = self.scaler.fit_transform(self.df[self.dimension_cols])
def save_results(self, data, filename, subdir='data'):
"""Save results to CSV file."""
output_path = self.output_dir / subdir / filename
if isinstance(data, pd.DataFrame):
data.to_csv(output_path, index=False)
else:
pd.DataFrame(data).to_csv(output_path)
print(f" Saved: {output_path}")
def save_plot(self, filename):
    """Write the active matplotlib figure to plots/ at 300 dpi, then close it."""
    destination = self.output_dir / 'plots' / filename
    plt.tight_layout()
    plt.savefig(destination, dpi=300, bbox_inches='tight')
    print(f" Saved: {destination}")
    # Close so figures never accumulate across the many analysis methods
    plt.close()
# ========== CLUSTERING ANALYSES ==========
def kmeans_clustering(self, n_clusters_range=(2, 10)):
    """Perform K-means clustering, choosing k by silhouette score.

    Args:
        n_clusters_range: Inclusive (min_k, max_k) candidate range.

    Returns:
        Per-protocol labels (also stored as df['kmeans_cluster']).
    """
    print("\n=== K-Means Clustering ===")
    # silhouette_score requires 2 <= k <= n_samples - 1; clamp the sweep so
    # small (e.g. shortform) datasets don't crash the elbow loop. This also
    # makes the old dead `if k > 1` guard unnecessary.
    low = max(2, n_clusters_range[0])
    high = min(n_clusters_range[1], len(self.df) - 1)
    k_range = range(low, high + 1)
    inertias = []
    silhouettes = []
    for k in k_range:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = kmeans.fit_predict(self.scaled_data)
        inertias.append(kmeans.inertia_)
        silhouettes.append(silhouette_score(self.scaled_data, labels))
    # Plot elbow curve and silhouette profile side by side
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
    ax1.plot(k_range, inertias, 'bo-')
    ax1.set_xlabel('Number of Clusters (k)')
    ax1.set_ylabel('Inertia')
    ax1.set_title('Elbow Method for Optimal k')
    ax1.grid(True, alpha=0.3)
    ax2.plot(k_range, silhouettes, 'ro-')
    ax2.set_xlabel('Number of Clusters (k)')
    ax2.set_ylabel('Silhouette Score')
    ax2.set_title('Silhouette Score by k')
    ax2.grid(True, alpha=0.3)
    self.save_plot('kmeans_elbow.png')
    # Refit at the k with the highest silhouette score
    optimal_k = k_range[np.argmax(silhouettes)]
    print(f"Optimal k by silhouette score: {optimal_k}")
    kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
    self.df['kmeans_cluster'] = kmeans.fit_predict(self.scaled_data)
    # Save 1-indexed assignments for readability
    results = self.df[['Descriptor', 'kmeans_cluster']].copy()
    results['cluster'] = results['kmeans_cluster'] + 1
    self.save_results(results[['Descriptor', 'cluster']], 'kmeans_clusters.csv')
    print(f"\nCluster sizes:")
    print(self.df['kmeans_cluster'].value_counts().sort_index())
    return self.df['kmeans_cluster']
def hierarchical_clustering(self, n_clusters=5, method='ward'):
    """Agglomerative clustering: save a dendrogram plus flat cluster labels."""
    print("\n=== Hierarchical Clustering ===")
    link_matrix = linkage(self.scaled_data, method=method)
    # Full dendrogram labelled by protocol name
    plt.figure(figsize=(16, 8))
    dendrogram(link_matrix, labels=self.df['Descriptor'].values, leaf_font_size=8)
    plt.title(f'Hierarchical Clustering Dendrogram ({method} linkage)')
    plt.xlabel('Protocol')
    plt.ylabel('Distance')
    plt.xticks(rotation=90)
    self.save_plot('hierarchical_dendrogram.png')
    # Flatten the tree into the requested number of clusters
    self.df['hierarchical_cluster'] = fcluster(link_matrix, n_clusters, criterion='maxclust')
    assignments = self.df[['Descriptor', 'hierarchical_cluster']].copy()
    assignments.columns = ['Descriptor', 'cluster']
    self.save_results(assignments, 'hierarchical_clusters.csv')
    print(f"\nCluster sizes:")
    print(self.df['hierarchical_cluster'].value_counts().sort_index())
    return self.df['hierarchical_cluster']
def dbscan_clustering(self, eps=3.0, min_samples=3):
    """DBSCAN density clustering; label -1 marks outlier protocols.

    Args:
        eps: Neighborhood radius in standardized-feature space.
        min_samples: Minimum neighbors for a core point.

    Returns:
        Per-protocol labels (also stored as df['dbscan_cluster']).
    """
    print("\n=== DBSCAN Clustering ===")
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    self.df['dbscan_cluster'] = dbscan.fit_predict(self.scaled_data)
    # BUG FIX: `-1 in series` tests the pandas *index*, not the values, so
    # the noise label was previously counted as a real cluster. Test values.
    labels = self.df['dbscan_cluster']
    n_outliers = (labels == -1).sum()
    n_clusters = labels.nunique() - (1 if n_outliers > 0 else 0)
    print(f"Found {n_clusters} clusters and {n_outliers} outliers")
    results = self.df[['Descriptor', 'dbscan_cluster']].copy()
    results.columns = ['Descriptor', 'cluster']
    self.save_results(results, 'dbscan_clusters.csv')
    if n_outliers > 0:
        outliers = self.df[self.df['dbscan_cluster'] == -1][['Descriptor']]
        self.save_results(outliers, 'dbscan_outliers.csv')
        print("\nOutlier protocols:")
        for protocol in outliers['Descriptor']:
            print(f" - {protocol}")
    return self.df['dbscan_cluster']
# ========== DIMENSIONALITY REDUCTION ==========
def pca_analysis(self, n_components=None):
    """PCA with variance plots, a 2D scatter, and a loadings heatmap.

    Args:
        n_components: Number of components to keep (None = all).

    Returns:
        (pca_coords, fitted PCA object)
    """
    print("\n=== Principal Component Analysis ===")
    pca = PCA() if n_components is None else PCA(n_components=n_components)
    pca_coords = pca.fit_transform(self.scaled_data)
    explained_var = pca.explained_variance_ratio_
    cumsum_var = np.cumsum(explained_var)
    # BUG FIX: hard-coded cumsum_var[4] raised IndexError when fewer than
    # 5 components exist (small / heavily-filtered datasets)
    n_top = min(5, len(cumsum_var))
    print(f"First {n_top} PCs explain {cumsum_var[n_top - 1]*100:.1f}% of variance")
    # Scree plot and cumulative variance curve
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
    n_show = min(15, len(explained_var))
    ax1.bar(range(1, n_show + 1), explained_var[:n_show])
    ax1.set_xlabel('Principal Component')
    ax1.set_ylabel('Explained Variance Ratio')
    ax1.set_title('Variance Explained by Each PC')
    ax1.grid(True, alpha=0.3, axis='y')
    ax2.plot(range(1, n_show + 1), cumsum_var[:n_show], 'o-')
    ax2.axhline(y=0.8, color='r', linestyle='--', alpha=0.5, label='80% threshold')
    ax2.set_xlabel('Number of Components')
    ax2.set_ylabel('Cumulative Explained Variance')
    ax2.set_title('Cumulative Variance Explained')
    ax2.legend()
    ax2.grid(True, alpha=0.3)
    self.save_plot('pca_variance.png')
    # 2D visualization -- guarded: needs at least two components
    if pca_coords.shape[1] >= 2:
        plt.figure(figsize=(12, 10))
        plt.scatter(pca_coords[:, 0], pca_coords[:, 1], alpha=0.6, s=50)
        for i, protocol in enumerate(self.df['Descriptor']):
            if i % 3 == 0:  # Label every 3rd point to avoid clutter
                plt.annotate(protocol, (pca_coords[i, 0], pca_coords[i, 1]),
                             fontsize=6, alpha=0.7)
        plt.xlabel(f'PC1 ({explained_var[0]*100:.1f}% variance)')
        plt.ylabel(f'PC2 ({explained_var[1]*100:.1f}% variance)')
        plt.title('Protocols in PCA Space (First 2 Components)')
        plt.grid(True, alpha=0.3)
        self.save_plot('pca_2d.png')
    # Save up to the first 5 PC coordinates per protocol
    n_keep = min(5, pca_coords.shape[1])
    pca_df = pd.DataFrame(pca_coords[:, :n_keep],
                          columns=[f'PC{i+1}' for i in range(n_keep)])
    pca_df.insert(0, 'Descriptor', self.df['Descriptor'])
    self.save_results(pca_df, 'pca_coordinates.csv')
    # Component loadings (dimension x PC)
    n_load = min(5, pca.components_.shape[0])
    loadings = pd.DataFrame(
        pca.components_[:n_load, :].T,
        columns=[f'PC{i+1}' for i in range(n_load)],
        index=self.dimension_cols
    )
    self.save_results(loadings, 'pca_loadings.csv')
    plt.figure(figsize=(10, 12))
    sns.heatmap(loadings, cmap='RdBu_r', center=0, cbar_kws={'label': 'Loading'})
    plt.title('PCA Component Loadings')
    plt.tight_layout()
    self.save_plot('pca_loadings_heatmap.png')
    return pca_coords, pca
def tsne_analysis(self, perplexity=30, n_components=2):
    """t-SNE projection of the standardized dimensions.

    Args:
        perplexity: Requested perplexity; clamped below n_samples because
            scikit-learn raises when perplexity >= n_samples.
        n_components: Embedding dimensionality (the plot assumes 2).

    Returns:
        The t-SNE coordinate array.
    """
    print("\n=== t-SNE Analysis ===")
    # ROBUSTNESS FIX: clamp so small datasets don't crash TSNE
    perplexity = min(perplexity, len(self.df) - 1)
    tsne = TSNE(n_components=n_components, perplexity=perplexity, random_state=42, max_iter=1000)
    tsne_coords = tsne.fit_transform(self.scaled_data)
    plt.figure(figsize=(12, 10))
    plt.scatter(tsne_coords[:, 0], tsne_coords[:, 1], alpha=0.6, s=50)
    for i, protocol in enumerate(self.df['Descriptor']):
        if i % 4 == 0:  # Label every 4th point to avoid clutter
            plt.annotate(protocol, (tsne_coords[i, 0], tsne_coords[i, 1]),
                         fontsize=6, alpha=0.7)
    plt.xlabel('t-SNE Dimension 1')
    plt.ylabel('t-SNE Dimension 2')
    plt.title(f't-SNE Projection (perplexity={perplexity})')
    plt.grid(True, alpha=0.3)
    self.save_plot('tsne_2d.png')
    tsne_df = pd.DataFrame(tsne_coords, columns=['TSNE1', 'TSNE2'])
    tsne_df.insert(0, 'Descriptor', self.df['Descriptor'])
    self.save_results(tsne_df, 'tsne_coordinates.csv')
    return tsne_coords
def umap_analysis(self, n_neighbors=15, min_dist=0.1, n_components=2):
    """UMAP projection; returns None when umap-learn is not installed."""
    print("\n=== UMAP Analysis ===")
    if not UMAP_AVAILABLE:
        print("UMAP not available. Install with: pip install umap-learn")
        return None
    embedder = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist,
                         n_components=n_components, random_state=42)
    coords = embedder.fit_transform(self.scaled_data)
    plt.figure(figsize=(12, 10))
    plt.scatter(coords[:, 0], coords[:, 1], alpha=0.6, s=50)
    # Annotate every 4th point to keep the plot legible
    for idx, protocol in enumerate(self.df['Descriptor']):
        if idx % 4 == 0:
            plt.annotate(protocol, (coords[idx, 0], coords[idx, 1]),
                         fontsize=6, alpha=0.7)
    plt.xlabel('UMAP Dimension 1')
    plt.ylabel('UMAP Dimension 2')
    plt.title(f'UMAP Projection (n_neighbors={n_neighbors}, min_dist={min_dist})')
    plt.grid(True, alpha=0.3)
    self.save_plot('umap_2d.png')
    out = pd.DataFrame(coords, columns=['UMAP1', 'UMAP2'])
    out.insert(0, 'Descriptor', self.df['Descriptor'])
    self.save_results(out, 'umap_coordinates.csv')
    return coords
def factor_analysis(self, n_factors=5):
    """Factor analysis: save loadings heatmap and per-protocol factor scores."""
    print("\n=== Factor Analysis ===")
    fa = FactorAnalysis(n_components=n_factors, random_state=42)
    fa_coords = fa.fit_transform(self.scaled_data)
    factor_names = [f'Factor{i+1}' for i in range(n_factors)]
    # Loadings: one row per dimension, one column per latent factor
    loadings = pd.DataFrame(fa.components_.T, columns=factor_names,
                            index=self.dimension_cols)
    self.save_results(loadings, 'factor_loadings.csv')
    plt.figure(figsize=(10, 12))
    sns.heatmap(loadings, cmap='RdBu_r', center=0, cbar_kws={'label': 'Loading'})
    plt.title('Factor Analysis Loadings')
    plt.tight_layout()
    self.save_plot('factor_loadings_heatmap.png')
    # Scores: one row per protocol
    scores = pd.DataFrame(fa_coords, columns=factor_names)
    scores.insert(0, 'Descriptor', self.df['Descriptor'])
    self.save_results(scores, 'factor_scores.csv')
    return fa_coords, fa
# ========== CORRELATION & STRUCTURE ==========
def correlation_analysis(self):
    """Compute and visualize correlation matrices across all dimensions.

    Saves the full matrix, the 20 strongest pairs by |r|, per-category
    heatmaps, and prints the most positive / most negative pairs.

    Returns:
        The full dimension-by-dimension correlation DataFrame.
    """
    print("\n=== Correlation Analysis ===")
    corr_matrix = self.df[self.dimension_cols].corr()
    plt.figure(figsize=(16, 14))
    sns.heatmap(corr_matrix, cmap='RdBu_r', center=0,
                square=True, linewidths=0.5, cbar_kws={'label': 'Correlation'})
    plt.title('Correlation Matrix - All Dimensions')
    plt.tight_layout()
    self.save_plot('correlation_heatmap_full.png')
    self.save_results(corr_matrix, 'correlation_matrix.csv')
    # Enumerate unique unordered dimension pairs
    corr_pairs = []
    for i in range(len(corr_matrix.columns)):
        for j in range(i + 1, len(corr_matrix.columns)):
            corr_pairs.append({
                'Dimension1': corr_matrix.columns[i],
                'Dimension2': corr_matrix.columns[j],
                'Correlation': corr_matrix.iloc[i, j]
            })
    # Strongest pairs by absolute value (for the saved CSV)
    corr_df = pd.DataFrame(corr_pairs).sort_values('Correlation',
                                                   key=abs,
                                                   ascending=False)
    self.save_results(corr_df.head(20), 'top_correlations.csv')
    # BUG FIX: the old code printed head/tail of the |r|-sorted frame, so
    # "negative" actually showed the *weakest* pairs. Sort by signed value.
    signed = corr_df.sort_values('Correlation', ascending=False)
    print("\nTop 5 positive correlations:")
    for _, row in signed.head(5).iterrows():
        print(f" {row['Dimension1']} <-> {row['Dimension2']}: {row['Correlation']:.3f}")
    print("\nTop 5 negative correlations:")
    for _, row in signed.tail(5).iloc[::-1].iterrows():
        print(f" {row['Dimension1']} <-> {row['Dimension2']}: {row['Correlation']:.3f}")
    # Within-category correlation heatmaps
    self._plot_category_correlation('Design', self.design_cols)
    self._plot_category_correlation('Entanglement', self.entanglement_cols)
    self._plot_category_correlation('Experience', self.experience_cols)
    return corr_matrix
def _plot_category_correlation(self, category_name, columns):
    """Save an annotated correlation heatmap for one dimension category."""
    matrix = self.df[columns].corr()
    plt.figure(figsize=(10, 8))
    sns.heatmap(matrix, annot=True, fmt='.2f', cmap='RdBu_r', center=0,
                square=True, linewidths=0.5, cbar_kws={'label': 'Correlation'})
    plt.title(f'{category_name} Dimensions - Correlation Matrix')
    plt.tight_layout()
    self.save_plot(f'correlation_heatmap_{category_name.lower()}.png')
def network_analysis(self, threshold=0.5):
    """Build a protocol-similarity graph and report centrality metrics."""
    print("\n=== Network Analysis ===")
    # Similarity = 1 at distance 0, 0 at the maximum pairwise distance
    dmat = squareform(pdist(self.scaled_data, metric='euclidean'))
    sim = 1 - (dmat / dmat.max())
    G = nx.Graph()
    for idx, protocol in enumerate(self.df['Descriptor']):
        G.add_node(idx, label=protocol)
    # Connect pairs whose similarity clears the threshold
    edge_count = 0
    n = len(sim)
    for a in range(n):
        for b in range(a + 1, n):
            if sim[a, b] > threshold:
                G.add_edge(a, b, weight=sim[a, b])
                edge_count += 1
    print(f"Network with {G.number_of_nodes()} nodes and {edge_count} edges")
    if G.number_of_edges() > 0:
        degree_centrality = nx.degree_centrality(G)
        betweenness = nx.betweenness_centrality(G)
        metrics_df = pd.DataFrame({
            'Descriptor': [self.df.iloc[i]['Descriptor'] for i in G.nodes()],
            'Degree_Centrality': [degree_centrality[i] for i in G.nodes()],
            'Betweenness_Centrality': [betweenness[i] for i in G.nodes()]
        }).sort_values('Degree_Centrality', ascending=False)
        self.save_results(metrics_df, 'network_metrics.csv')
        print("\nTop 5 most central protocols:")
        for _, row in metrics_df.head(5).iterrows():
            print(f" {row['Descriptor']}: {row['Degree_Centrality']:.3f}")
        # Spring layout; node size scales with degree centrality
        plt.figure(figsize=(16, 16))
        pos = nx.spring_layout(G, k=0.5, iterations=50, seed=42)
        node_sizes = [degree_centrality[i] * 3000 + 100 for i in G.nodes()]
        nx.draw_networkx_nodes(G, pos, node_size=node_sizes,
                               node_color='lightblue', alpha=0.7)
        nx.draw_networkx_edges(G, pos, alpha=0.2)
        # Only label well-connected nodes to keep the plot readable
        high_centrality = {i: self.df.iloc[i]['Descriptor']
                           for i in G.nodes() if degree_centrality[i] > 0.1}
        nx.draw_networkx_labels(G, pos, labels=high_centrality, font_size=8)
        plt.title(f'Protocol Similarity Network (threshold={threshold})')
        plt.axis('off')
        plt.tight_layout()
        self.save_plot('network_graph.png')
    else:
        print("No edges above threshold - try lowering the threshold")
    return G
# ========== CLASSIFICATION & PREDICTION ==========
def category_discriminant_analysis(self):
    """Check how well each dimension category predicts the k-means clusters.

    Returns:
        List of per-category result dicts (empty if clustering hasn't run).
    """
    print("\n=== Category Discriminant Analysis ===")
    # BUG FIX: previously `y` was only assigned when the column existed, and
    # a bare `except:` silently swallowed the resulting NameError. Bail out
    # explicitly when clustering hasn't been run yet.
    if 'kmeans_cluster' not in self.df.columns:
        print("Run clustering first to enable category discriminant analysis")
        return []
    y = self.df['kmeans_cluster'].values
    results = []
    for category_name, columns in [('Design', self.design_cols),
                                   ('Entanglement', self.entanglement_cols),
                                   ('Experience', self.experience_cols)]:
        X = self.df[columns].values
        try:
            lda = LinearDiscriminantAnalysis()
            lda.fit(X, y)
            score = lda.score(X, y)
            results.append({
                'Category': category_name,
                'Accuracy': score,
                'N_Dimensions': len(columns)
            })
            print(f"{category_name} dimensions predict clusters with {score*100:.1f}% accuracy")
        except Exception:
            # e.g. a category emptied out by --min-coverage filtering
            print(f"Could not perform LDA for {category_name}")
    if results:
        self.save_results(pd.DataFrame(results), 'category_discriminant_results.csv')
    return results
def feature_importance_analysis(self):
    """Rank dimensions by random-forest importance for the k-means labels."""
    print("\n=== Feature Importance Analysis ===")
    if 'kmeans_cluster' not in self.df.columns:
        print("Run clustering first to enable feature importance analysis")
        return None
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(self.df[self.dimension_cols].values,
               self.df['kmeans_cluster'].values)
    ranking = pd.DataFrame({
        'Dimension': self.dimension_cols,
        'Importance': forest.feature_importances_
    }).sort_values('Importance', ascending=False)
    self.save_results(ranking, 'feature_importances.csv')
    # Horizontal bars for the 20 strongest dimensions, strongest on top
    top = ranking.head(20)
    plt.figure(figsize=(10, 12))
    plt.barh(range(len(top)), top['Importance'])
    plt.yticks(range(len(top)), top['Dimension'])
    plt.xlabel('Importance')
    plt.title('Top 20 Most Important Dimensions for Clustering')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    self.save_plot('feature_importances.png')
    print("\nTop 10 most important dimensions:")
    for _, row in ranking.head(10).iterrows():
        print(f" {row['Dimension']}: {row['Importance']:.4f}")
    return ranking
def analyst_comparison(self):
    """Compare mean dimension ratings across analysts, if recorded."""
    print("\n=== Analyst Comparison ===")
    if 'analyst' not in self.df.columns:
        print("No analyst column found")
        return None
    analysts = self.df['analyst'].unique()
    print(f"Found {len(analysts)} unique analysts")
    analyst_means = self.df.groupby('analyst')[self.dimension_cols].mean()
    self.save_results(analyst_means, 'analyst_mean_ratings.csv')
    # One panel per category, analysts as series
    categories = [('Design', self.design_cols),
                  ('Entanglement', self.entanglement_cols),
                  ('Experience', self.experience_cols)]
    fig, axes = plt.subplots(3, 1, figsize=(14, 12))
    for ax, (category_name, columns) in zip(axes, categories):
        analyst_means[columns].T.plot(ax=ax, marker='o')
        ax.set_title(f'{category_name} Dimensions - Mean Ratings by Analyst')
        ax.set_ylabel('Mean Rating')
        ax.legend(title='Analyst', bbox_to_anchor=(1.05, 1), loc='upper left')
        ax.grid(True, alpha=0.3)
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
    plt.tight_layout()
    self.save_plot('analyst_comparison.png')
    return analyst_means
# ========== SUMMARY REPORT ==========
def generate_summary_report(self):
    """Write a plain-text summary of the dataset and analyses performed.

    Returns:
        The report text (also saved to reports/analysis_summary.txt).
    """
    print("\n=== Generating Summary Report ===")
    report_lines = []
    report_lines.append("=" * 80)
    report_lines.append("MULTIVARIATE ANALYSIS SUMMARY REPORT")
    report_lines.append("Protocol Bicorder Dataset")
    report_lines.append("=" * 80)
    report_lines.append("")
    report_lines.append(f"Dataset: {self.csv_path}")
    report_lines.append(f"Number of protocols: {len(self.df)}")
    report_lines.append(f"Number of dimensions: {len(self.dimension_cols)}")
    report_lines.append(f" - Design: {len(self.design_cols)}")
    report_lines.append(f" - Entanglement: {len(self.entanglement_cols)}")
    report_lines.append(f" - Experience: {len(self.experience_cols)}")
    report_lines.append("")
    report_lines.append("-" * 80)
    report_lines.append("ANALYSES PERFORMED")
    report_lines.append("-" * 80)
    # Cluster columns are only present if the corresponding analysis ran.
    # (CLEANUP: removed the dead `analyses_run` list that was appended to
    # but never read anywhere.)
    if 'kmeans_cluster' in self.df.columns:
        report_lines.append(f"K-Means: {len(self.df['kmeans_cluster'].unique())} clusters identified")
    if 'hierarchical_cluster' in self.df.columns:
        report_lines.append(f"Hierarchical: {len(self.df['hierarchical_cluster'].unique())} clusters")
    if 'dbscan_cluster' in self.df.columns:
        n_outliers = (self.df['dbscan_cluster'] == -1).sum()
        report_lines.append(f"DBSCAN: {n_outliers} outlier protocols identified")
    report_lines.append("")
    report_lines.append("Dimensionality Reduction:")
    report_lines.append("- Principal Component Analysis (PCA)")
    report_lines.append("- t-SNE Projection")
    if UMAP_AVAILABLE:
        report_lines.append("- UMAP Projection")
    report_lines.append("- Factor Analysis")
    report_lines.append("")
    report_lines.append("Statistical Analyses:")
    report_lines.append("- Correlation Analysis")
    report_lines.append("- Network Analysis")
    report_lines.append("- Feature Importance Analysis")
    if 'analyst' in self.df.columns:
        report_lines.append("- Analyst Comparison")
    report_lines.append("")
    report_lines.append("-" * 80)
    report_lines.append("OUTPUT FILES")
    report_lines.append("-" * 80)
    report_lines.append(f"All results saved to: {self.output_dir}/")
    report_lines.append(" - plots/ : All visualizations (PNG)")
    report_lines.append(" - data/ : All numerical results (CSV)")
    report_lines.append(" - reports/ : This summary report")
    report_lines.append("")
    report_lines.append("=" * 80)
    report_lines.append("END OF REPORT")
    report_lines.append("=" * 80)
    report_text = "\n".join(report_lines)
    report_path = self.output_dir / 'reports' / 'analysis_summary.txt'
    with open(report_path, 'w') as f:
        f.write(report_text)
    print(f" Saved: {report_path}")
    print("\n" + report_text)
    return report_text
def main():
    """Parse CLI arguments, run the requested analyses, and report results."""
    parser = argparse.ArgumentParser(
        description='Multivariate analysis of Protocol Bicorder data',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python3 scripts/multivariate_analysis.py data/readings/synthetic_20251116/readings.csv
  python3 scripts/multivariate_analysis.py data/readings/synthetic_20251116/readings.csv --output data/readings/synthetic_20251116/analysis
  python3 scripts/multivariate_analysis.py data/readings/synthetic_20251116/readings.csv --analyses clustering pca
"""
    )
    parser.add_argument('csv_file', help='Diagnostic readings CSV (e.g. data/readings/synthetic_20251116/readings.csv)')
    parser.add_argument('--output', '-o', default=None,
                        help='Output directory (default: <dataset_dir>/analysis)')
    # TEXT FIX: restored the garbled range "0.01.0" to "0.0-1.0" in the help
    parser.add_argument('--min-coverage', type=float, default=0.0,
                        help='Drop dimension columns below this coverage fraction (0.0-1.0). '
                             'E.g. 0.8 keeps only columns ≥80%% complete. '
                             'Useful for sparse/shortform datasets (default: 0.0, keep all)')
    parser.add_argument('--analyses', nargs='+',
                        choices=['clustering', 'pca', 'tsne', 'umap', 'factor',
                                 'correlation', 'network', 'importance', 'analyst', 'all'],
                        default=['all'],
                        help='Which analyses to run (default: all)')
    args = parser.parse_args()
    if not Path(args.csv_file).exists():
        print(f"Error: File not found: {args.csv_file}")
        sys.exit(1)
    # Default the output directory to a sibling 'analysis' dir of the dataset
    output_dir = args.output if args.output else str(Path(args.csv_file).parent / 'analysis')
    print("=" * 80)
    print("PROTOCOL BICORDER - MULTIVARIATE ANALYSIS")
    print("=" * 80)
    analyzer = ProtocolAnalyzer(args.csv_file, output_dir, min_coverage=args.min_coverage)
    selected = set(args.analyses)

    def wanted(name):
        """True when the named analysis (or 'all') was requested."""
        return 'all' in selected or name in selected

    try:
        # Clustering
        if wanted('clustering'):
            analyzer.kmeans_clustering()
            analyzer.hierarchical_clustering()
            analyzer.dbscan_clustering()
        # Dimensionality reduction
        if wanted('pca'):
            analyzer.pca_analysis()
        if wanted('tsne'):
            analyzer.tsne_analysis()
        if wanted('umap'):
            analyzer.umap_analysis()
        if wanted('factor'):
            analyzer.factor_analysis()
        # Correlation and structure
        if wanted('correlation'):
            analyzer.correlation_analysis()
        if wanted('network'):
            analyzer.network_analysis(threshold=0.6)
        # Classification
        if wanted('importance'):
            analyzer.category_discriminant_analysis()
            analyzer.feature_importance_analysis()
        if wanted('analyst'):
            analyzer.analyst_comparison()
        # Always finish with the summary report
        analyzer.generate_summary_report()
        print("\n" + "=" * 80)
        print("ANALYSIS COMPLETE!")
        print("=" * 80)
        print(f"\nAll results saved to: {analyzer.output_dir}/")
    except Exception as e:
        # Top-level boundary: report, dump the traceback, exit non-zero
        print(f"\nError during analysis: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
# Standard entry-point guard: run the CLI only when executed as a script.
if __name__ == '__main__':
    main()