Additional analysis: examining clustering via LDA
This commit is contained in:
147
analysis/visualize_clusters.py
Normal file
147
analysis/visualize_clusters.py
Normal file
@@ -0,0 +1,147 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Create visualizations of k-means clusters overlaid on dimensionality reduction plots.
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
from pathlib import Path
|
||||
|
||||
# Set up paths: every input and output of this script lives under
# analysis_results/.
output_dir = Path('analysis_results')
plots_dir = output_dir / 'plots'
data_dir = output_dir / 'data'

# Load cluster assignments. The cluster ids are used exactly as stored
# (the file is expected to be 1-indexed already), so nothing is re-indexed.
clusters = pd.read_csv(data_dir / 'kmeans_clusters.csv')

# Fail fast with a readable message if the columns that the merges and the
# per-cluster filters rely on are absent. KeyError matches what pandas itself
# would raise on the first access to a missing column.
_missing_columns = {'Descriptor', 'cluster'} - set(clusters.columns)
if _missing_columns:
    raise KeyError(
        "kmeans_clusters.csv is missing required column(s): "
        f"{sorted(_missing_columns)}"
    )

# Load dimensionality reduction coordinates
pca_coords = pd.read_csv(data_dir / 'pca_coordinates.csv')
tsne_coords = pd.read_csv(data_dir / 'tsne_coordinates.csv')

# Attach the cluster id to every coordinate row. merge() defaults to an
# inner join, so a Descriptor present in only one of the two files is dropped.
pca_data = pca_coords.merge(clusters, on='Descriptor')
tsne_data = tsne_coords.merge(clusters, on='Descriptor')

# Colour scheme and legend text, keyed by cluster id.
colors = {1: '#2E86AB', 2: '#A23B72'}  # Blue for cluster 1, Purple for cluster 2
cluster_names = {1: 'Cluster 1: Relational/Cultural', 2: 'Cluster 2: Institutional/Bureaucratic'}
|
||||
|
||||
# ========== Cluster scatter plots (PCA, t-SNE, optional UMAP) ==========
def _plot_clusters(data, x_col, y_col, x_label, y_label, title, out_path,
                   label_every=8):
    """Draw one scatter plot of the two k-means clusters and save it as a PNG.

    Args:
        data: DataFrame with a 'Descriptor' column, a 'cluster' column and
            the two coordinate columns named by ``x_col`` / ``y_col``.
        x_col: Name of the column plotted on the x axis.
        y_col: Name of the column plotted on the y axis.
        x_label: Text for the x-axis label.
        y_label: Text for the y-axis label.
        title: Figure title.
        out_path: Path of the PNG to write; its parent directory is created
            if it does not exist yet.
        label_every: Within each cluster, annotate every ``label_every``-th
            row (by position) with its descriptor to avoid clutter.
    """
    fig, ax = plt.subplots(figsize=(14, 12))

    for cluster_id in [1, 2]:
        cluster_data = data[data['cluster'] == cluster_id]
        ax.scatter(cluster_data[x_col], cluster_data[y_col],
                   c=colors[cluster_id], label=cluster_names[cluster_id],
                   alpha=0.6, s=60, edgecolors='white', linewidth=0.5)

        # Sample by position inside the cluster. Testing the index label
        # (``i % 8``) depends on where rows happen to sit in the merged
        # frame, so a cluster could end up with no labels at all.
        sampled = cluster_data.iloc[::label_every]
        for descriptor, x, y in zip(sampled['Descriptor'],
                                    sampled[x_col], sampled[y_col]):
            ax.annotate(descriptor, (x, y),
                        fontsize=7, alpha=0.7,
                        xytext=(5, 5), textcoords='offset points')

    ax.set_xlabel(x_label, fontsize=12)
    ax.set_ylabel(y_label, fontsize=12)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.legend(loc='best', fontsize=10, framealpha=0.9)
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    # savefig does not create missing directories, so make sure it exists.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(out_path, dpi=300, bbox_inches='tight')
    print(f" Saved: {out_path}")
    plt.close(fig)


# ---------- PCA ----------
print("Creating PCA plot with cluster colors...")
# NOTE(review): the variance percentages below are hard-coded, and PC2 (22.7%)
# is larger than PC1 (22.5%), which is unexpected if the components are
# ordered by explained variance - confirm against the PCA output.
_plot_clusters(
    pca_data, 'PC1', 'PC2',
    'PC1 (22.5% variance)', 'PC2 (22.7% variance)',
    'K-Means Clusters in PCA Space\nTwo Distinct Protocol Families',
    plots_dir / 'pca_2d_clustered.png',
)

# ---------- t-SNE ----------
print("Creating t-SNE plot with cluster colors...")
_plot_clusters(
    tsne_data, 'TSNE1', 'TSNE2',
    't-SNE Dimension 1', 't-SNE Dimension 2',
    'K-Means Clusters in t-SNE Space\nTwo Distinct Protocol Families',
    plots_dir / 'tsne_2d_clustered.png',
)

# ---------- UMAP (only when the coordinates file is present) ----------
umap_path = data_dir / 'umap_coordinates.csv'
if umap_path.exists():
    print("Creating UMAP plot with cluster colors...")
    umap_coords = pd.read_csv(umap_path)
    umap_data = umap_coords.merge(clusters, on='Descriptor')
    _plot_clusters(
        umap_data, 'UMAP1', 'UMAP2',
        'UMAP Dimension 1', 'UMAP Dimension 2',
        'K-Means Clusters in UMAP Space\nTwo Distinct Protocol Families',
        plots_dir / 'umap_2d_clustered.png',
    )
|
||||
|
||||
# ========== Summary Statistics ==========
# Console report: overall size, the size of each cluster, then up to ten
# example descriptors per cluster.
print("\n=== Cluster Summary ===")
print(f"Total protocols: {len(clusters)}")

# Select each cluster's descriptors once and reuse them for both the counts
# and the samples.
relational = clusters.loc[clusters['cluster'] == 1, 'Descriptor']
institutional = clusters.loc[clusters['cluster'] == 2, 'Descriptor']

print(f"\nCluster 1 (Relational/Cultural): {len(relational)} protocols")
print(f"Cluster 2 (Institutional/Bureaucratic): {len(institutional)} protocols")

print("\nSample protocols from each cluster:")

print("\nCluster 1 (Relational/Cultural):")
for name in relational.iloc[:10]:
    print(f" - {name}")

print("\nCluster 2 (Institutional/Bureaucratic):")
for name in institutional.iloc[:10]:
    print(f" - {name}")

print("\n=== Visualization Complete! ===")
|
||||
Reference in New Issue
Block a user