#!/usr/bin/env python3
"""
Create visualizations of k-means clusters overlaid on dimensionality reduction plots.

Inputs (read from ``analysis_results/data``):
    kmeans_clusters.csv   -- columns ``Descriptor`` and ``cluster``
    pca_coordinates.csv   -- columns ``Descriptor``, ``PC1``, ``PC2``
    tsne_coordinates.csv  -- columns ``Descriptor``, ``TSNE1``, ``TSNE2``
    umap_coordinates.csv  -- optional; columns ``Descriptor``, ``UMAP1``, ``UMAP2``

Outputs (written to ``analysis_results/plots``):
    pca_2d_clustered.png, tsne_2d_clustered.png and, when the UMAP
    coordinates exist, umap_2d_clustered.png. A short cluster summary is
    printed to stdout.
"""

from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns  # not referenced below; kept so the file's import set is unchanged

# Set up paths
output_dir = Path('analysis_results')
plots_dir = output_dir / 'plots'
data_dir = output_dir / 'data'

# Cluster ids as stored in kmeans_clusters.csv; only these ids are plotted.
CLUSTER_IDS = (1, 2)

# Annotate every N-th row of the merged table to avoid clutter.
LABEL_EVERY = 8

# Set up color scheme
colors = {1: '#2E86AB', 2: '#A23B72'}  # Blue for cluster 1, Purple for cluster 2
cluster_labels = {1: 'Relational/Cultural', 2: 'Institutional/Bureaucratic'}
cluster_names = {
    cluster_id: f'Cluster {cluster_id}: {cluster_labels[cluster_id]}'
    for cluster_id in CLUSTER_IDS
}


def merge_with_clusters(coords, cluster_table, method_name):
    """Attach cluster assignments to a coordinate table.

    Args:
        coords: DataFrame with a ``Descriptor`` column plus coordinate columns.
        cluster_table: DataFrame with ``Descriptor`` and ``cluster`` columns.
        method_name: Name of the reduction method, used only in the warning.

    Returns:
        The inner join of ``coords`` and ``cluster_table`` on ``Descriptor``.
    """
    merged = coords.merge(cluster_table, on='Descriptor')
    # An inner join silently discards descriptors that have no cluster
    # assignment; surface that instead of quietly plotting fewer points.
    dropped = len(coords) - len(merged)
    if dropped > 0:
        print(f" Warning: {dropped} {method_name} rows have no cluster "
              f"assignment and are not plotted")
    return merged


def plot_clusters(data, x_col, y_col, x_label, y_label, method_name, filename):
    """Draw one cluster-colored scatter plot and save it under ``plots_dir``.

    Args:
        data: Merged DataFrame holding ``Descriptor``, ``cluster`` and the
            two coordinate columns.
        x_col: Name of the column plotted on the x axis.
        y_col: Name of the column plotted on the y axis.
        x_label: Text for the x-axis label.
        y_label: Text for the y-axis label.
        method_name: Reduction method name used in the progress message and
            in the plot title.
        filename: Name of the PNG file created inside ``plots_dir``.
    """
    print(f"Creating {method_name} plot with cluster colors...")

    fig, ax = plt.subplots(figsize=(14, 12))

    for cluster_id in CLUSTER_IDS:
        members = data[data['cluster'] == cluster_id]
        ax.scatter(members[x_col], members[y_col],
                   c=colors[cluster_id], label=cluster_names[cluster_id],
                   alpha=0.6, s=60, edgecolors='white', linewidth=0.5)

    # Annotate some representative protocols. ``merge`` returns a fresh
    # 0..n-1 index, so taking every LABEL_EVERY-th row by position selects
    # the rows whose index is a multiple of LABEL_EVERY without depending
    # on the index labels themselves.
    labelled = data.iloc[::LABEL_EVERY]
    labelled = labelled[labelled['cluster'].isin(CLUSTER_IDS)]
    for descriptor, x, y in zip(labelled['Descriptor'],
                                labelled[x_col], labelled[y_col]):
        ax.annotate(descriptor, (x, y), fontsize=7, alpha=0.7,
                    xytext=(5, 5), textcoords='offset points')

    ax.set_xlabel(x_label, fontsize=12)
    ax.set_ylabel(y_label, fontsize=12)
    ax.set_title(f'K-Means Clusters in {method_name} Space\nTwo Distinct Protocol Families',
                 fontsize=14, fontweight='bold')
    ax.legend(loc='best', fontsize=10, framealpha=0.9)
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    out_path = plots_dir / filename
    plt.savefig(out_path, dpi=300, bbox_inches='tight')
    print(f" Saved: {out_path}")
    plt.close(fig)


# savefig does not create missing directories, so make sure the target exists.
plots_dir.mkdir(parents=True, exist_ok=True)

# Load cluster assignments (the 'cluster' column is already 1-indexed)
clusters = pd.read_csv(data_dir / 'kmeans_clusters.csv')

# Load dimensionality reduction coordinates
pca_coords = pd.read_csv(data_dir / 'pca_coordinates.csv')
tsne_coords = pd.read_csv(data_dir / 'tsne_coordinates.csv')

# Merge cluster assignments with coordinates
pca_data = merge_with_clusters(pca_coords, clusters, 'PCA')
tsne_data = merge_with_clusters(tsne_coords, clusters, 't-SNE')

# ========== PCA Plot with Clusters ==========
# NOTE(review): the variance percentages are hard-coded, and PC2 (22.7%) is
# larger than PC1 (22.5%) although PCA components are normally ordered by
# decreasing explained variance -- confirm against the PCA output.
plot_clusters(pca_data, 'PC1', 'PC2',
              'PC1 (22.5% variance)', 'PC2 (22.7% variance)',
              'PCA', 'pca_2d_clustered.png')

# ========== t-SNE Plot with Clusters ==========
plot_clusters(tsne_data, 'TSNE1', 'TSNE2',
              't-SNE Dimension 1', 't-SNE Dimension 2',
              't-SNE', 'tsne_2d_clustered.png')

# ========== UMAP Plot with Clusters (if available) ==========
umap_path = data_dir / 'umap_coordinates.csv'
if umap_path.exists():
    umap_coords = pd.read_csv(umap_path)
    umap_data = merge_with_clusters(umap_coords, clusters, 'UMAP')
    plot_clusters(umap_data, 'UMAP1', 'UMAP2',
                  'UMAP Dimension 1', 'UMAP Dimension 2',
                  'UMAP', 'umap_2d_clustered.png')

# ========== Summary Statistics ==========
print("\n=== Cluster Summary ===")
print(f"Total protocols: {len(clusters)}")

members_by_cluster = {
    cluster_id: clusters[clusters['cluster'] == cluster_id]
    for cluster_id in CLUSTER_IDS
}

print()
for cluster_id, members in members_by_cluster.items():
    print(f"Cluster {cluster_id} ({cluster_labels[cluster_id]}): {len(members)} protocols")

print("\nSample protocols from each cluster:")
for cluster_id, members in members_by_cluster.items():
    print(f"\nCluster {cluster_id} ({cluster_labels[cluster_id]}):")
    for protocol in members['Descriptor'].head(10):
        print(f" - {protocol}")

print("\n=== Visualization Complete! ===")