Reorganize directory, add manual dataset and sync tooling

- Move all scripts to scripts/, web assets to web/, and analysis results into self-contained data/readings/<type>_<YYYYMMDD>/ directories
- Add data/readings/manual_20260320/ with 32 JSON readings from git.medlab.host/ntnsndr/protocol-bicorder-data
- Add scripts/json_to_csv.py to convert bicorder JSON files to CSV
- Add scripts/sync_readings.sh for one-command sync + re-analysis of any dataset backed by a .sync_source config file
- Add scripts/classify_readings.py to apply the LDA classifier to all readings and save per-reading cluster assignments
- Add --min-coverage flag to multivariate_analysis.py for sparse/shortform datasets; also applies in lda_visualization.py
- Fix lda_visualization.py NaN handling and 0-d array annotation bug
- Update README.md and WORKFLOW.md to document datasets, sync workflow, shortform handling, and new scripts

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-20 17:35:13 -06:00
parent 0c794dddae
commit 897c30406b
545 changed files with 10715 additions and 718 deletions
@@ -0,0 +1,167 @@
+#!/usr/bin/env python3
+"""
+Create visualizations of k-means clusters overlaid on dimensionality reduction plots.
+
+Usage:
+  python3 scripts/visualize_clusters.py data/readings/synthetic_20251116/readings.csv
+  python3 scripts/visualize_clusters.py data/readings/manual_20260101/readings.csv --analysis-dir data/readings/manual_20260101/analysis
+"""
+
+import argparse
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+from pathlib import Path
+
+
+def _plot_clusters(data, x_col, y_col, *, xlabel, ylabel, title, out_path,
+                   colors, cluster_names, label_every=8):
+    """Draw one 2-D embedding coloured by cluster and save it as a PNG.
+
+    Args:
+        data: DataFrame with columns ``x_col``, ``y_col``, 'cluster' and
+            'Descriptor'.
+        x_col: Name of the column plotted on the x axis.
+        y_col: Name of the column plotted on the y axis.
+        xlabel: X-axis label.
+        ylabel: Y-axis label.
+        title: Figure title.
+        out_path: Destination path of the PNG file.
+        colors: Mapping of cluster id -> colour. Only the ids present in
+            this mapping are drawn.
+        cluster_names: Mapping of cluster id -> legend label.
+        label_every: A point is annotated with its descriptor when its
+            index label in ``data`` is a multiple of this value.
+    """
+    fig, ax = plt.subplots(figsize=(14, 12))
+
+    # One scatter call per cluster so each gets its own legend entry.
+    for cluster_id, color in colors.items():
+        cluster_data = data[data['cluster'] == cluster_id]
+        ax.scatter(cluster_data[x_col], cluster_data[y_col],
+                   c=color, label=cluster_names[cluster_id],
+                   alpha=0.6, s=60, edgecolors='white', linewidth=0.5)
+
+    # Annotate only a subset of the points so the labels stay readable.
+    for cluster_id in colors:
+        cluster_data = data[data['cluster'] == cluster_id]
+        for i, row in cluster_data.iterrows():
+            if i % label_every == 0:
+                ax.annotate(row['Descriptor'],
+                            (row[x_col], row[y_col]),
+                            fontsize=7, alpha=0.7,
+                            xytext=(5, 5), textcoords='offset points')
+
+    ax.set_xlabel(xlabel, fontsize=12)
+    ax.set_ylabel(ylabel, fontsize=12)
+    ax.set_title(title, fontsize=14, fontweight='bold')
+    ax.legend(loc='best', fontsize=10, framealpha=0.9)
+    ax.grid(True, alpha=0.3)
+
+    plt.tight_layout()
+    plt.savefig(out_path, dpi=300, bbox_inches='tight')
+    print(f"  Saved: {out_path}")
+    plt.close(fig)
+
+
+def main():
+    """Plot k-means cluster assignments on the PCA, t-SNE and UMAP embeddings.
+
+    Command-line arguments:
+        input_csv: Path of the readings CSV. The file itself is not read;
+            only its parent directory is used to locate the default
+            analysis directory.
+        --analysis-dir: Analysis directory; defaults to
+            ``<dataset_dir>/analysis``.
+
+    Reads ``kmeans_clusters.csv``, ``pca_coordinates.csv`` and
+    ``tsne_coordinates.csv`` (plus ``umap_coordinates.csv`` when it exists)
+    from ``<analysis_dir>/data`` and joins them on the 'Descriptor' column.
+    Writes ``pca_2d_clustered.png``, ``tsne_2d_clustered.png`` and, when UMAP
+    coordinates are available, ``umap_2d_clustered.png`` to
+    ``<analysis_dir>/plots``, then prints a short cluster summary.
+
+    Only cluster ids 1 and 2 are drawn and summarised; rows with any other
+    id are ignored.
+
+    Raises:
+        FileNotFoundError: If a required CSV file is missing.
+        KeyError: If an expected column is absent from one of the CSV files.
+    """
+    parser = argparse.ArgumentParser(
+        description='Visualize k-means clusters in PCA/t-SNE/UMAP space',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Example usage:
+  python3 scripts/visualize_clusters.py data/readings/synthetic_20251116/readings.csv
+  python3 scripts/visualize_clusters.py data/readings/manual_20260101/readings.csv --analysis-dir data/readings/manual_20260101/analysis
+        """
+    )
+    parser.add_argument('input_csv', help='Diagnostic readings CSV (e.g. data/readings/synthetic_20251116/readings.csv)')
+    parser.add_argument('--analysis-dir', default=None,
+                        help='Analysis directory (default: <dataset_dir>/analysis)')
+    args = parser.parse_args()
+
+    dataset_dir = Path(args.input_csv).parent
+    results_dir = Path(args.analysis_dir) if args.analysis_dir else dataset_dir / 'analysis'
+    plots_dir = results_dir / 'plots'
+    data_dir = results_dir / 'data'
+
+    # Load cluster assignments; ids are expected to be 1-indexed so they
+    # match the keys of the colour/label mappings below.
+    clusters = pd.read_csv(data_dir / 'kmeans_clusters.csv')
+
+    # Load dimensionality reduction coordinates
+    pca_coords = pd.read_csv(data_dir / 'pca_coordinates.csv')
+    tsne_coords = pd.read_csv(data_dir / 'tsne_coordinates.csv')
+
+    # Merge cluster assignments with coordinates
+    pca_data = pca_coords.merge(clusters, on='Descriptor')
+    tsne_data = tsne_coords.merge(clusters, on='Descriptor')
+
+    # Create the output directory only once the inputs have loaded, so a
+    # mistyped path does not leave stray empty directories behind.
+    plots_dir.mkdir(parents=True, exist_ok=True)
+
+    # Set up color scheme
+    colors = {1: '#2E86AB', 2: '#A23B72'}  # Blue for cluster 1, Purple for cluster 2
+    cluster_names = {1: 'Cluster 1: Relational/Cultural', 2: 'Cluster 2: Institutional/Bureaucratic'}
+
+    # ========== PCA Plot with Clusters ==========
+    # The axis labels carry no explained-variance figures: those depend on
+    # the dataset and cannot be derived from the coordinate file alone.
+    print("Creating PCA plot with cluster colors...")
+    _plot_clusters(
+        pca_data, 'PC1', 'PC2',
+        xlabel='PC1',
+        ylabel='PC2',
+        title='K-Means Clusters in PCA Space\nTwo Distinct Protocol Families',
+        out_path=plots_dir / 'pca_2d_clustered.png',
+        colors=colors, cluster_names=cluster_names,
+    )
+
+    # ========== t-SNE Plot with Clusters ==========
+    print("Creating t-SNE plot with cluster colors...")
+    _plot_clusters(
+        tsne_data, 'TSNE1', 'TSNE2',
+        xlabel='t-SNE Dimension 1',
+        ylabel='t-SNE Dimension 2',
+        title='K-Means Clusters in t-SNE Space\nTwo Distinct Protocol Families',
+        out_path=plots_dir / 'tsne_2d_clustered.png',
+        colors=colors, cluster_names=cluster_names,
+    )
+
+    # ========== UMAP Plot with Clusters (if available) ==========
+    umap_path = data_dir / 'umap_coordinates.csv'
+    if umap_path.exists():
+        print("Creating UMAP plot with cluster colors...")
+        umap_coords = pd.read_csv(umap_path)
+        umap_data = umap_coords.merge(clusters, on='Descriptor')
+        _plot_clusters(
+            umap_data, 'UMAP1', 'UMAP2',
+            xlabel='UMAP Dimension 1',
+            ylabel='UMAP Dimension 2',
+            title='K-Means Clusters in UMAP Space\nTwo Distinct Protocol Families',
+            out_path=plots_dir / 'umap_2d_clustered.png',
+            colors=colors, cluster_names=cluster_names,
+        )
+
+    # ========== Summary Statistics ==========
+    print("\n=== Cluster Summary ===")
+    print(f"Total protocols: {len(clusters)}")
+    print(f"\nCluster 1 (Relational/Cultural): {len(clusters[clusters['cluster'] == 1])} protocols")
+    print(f"Cluster 2 (Institutional/Bureaucratic): {len(clusters[clusters['cluster'] == 2])} protocols")
+
+    print("\nSample protocols from each cluster:")
+    print("\nCluster 1 (Relational/Cultural):")
+    for protocol in clusters[clusters['cluster'] == 1]['Descriptor'].head(10):
+        print(f"  - {protocol}")
+
+    print("\nCluster 2 (Institutional/Bureaucratic):")
+    for protocol in clusters[clusters['cluster'] == 2]['Descriptor'].head(10):
+        print(f"  - {protocol}")
+
+    print("\n=== Visualization Complete! ===")
+
+
+# Run the command-line interface only when this file is executed as a script,
+# not when it is imported as a module.
+if __name__ == '__main__':
+    main()