Additional analysis: examining clustering via LDA
This commit is contained in:
146
analysis/lda_visualization.py
Normal file
146
analysis/lda_visualization.py
Normal file
@@ -0,0 +1,146 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Create LDA visualization to maximize cluster separation.
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from pathlib import Path
|
||||
|
||||
# Set up paths
|
||||
output_dir = Path('analysis_results')
|
||||
plots_dir = output_dir / 'plots'
|
||||
data_dir = output_dir / 'data'
|
||||
|
||||
# Load the original data
|
||||
df = pd.read_csv('diagnostic_output.csv')
|
||||
|
||||
# Identify dimension columns
|
||||
all_cols = df.columns.tolist()
|
||||
design_cols = [c for c in all_cols if c.startswith('Design_')]
|
||||
entanglement_cols = [c for c in all_cols if c.startswith('Entanglement_')]
|
||||
experience_cols = [c for c in all_cols if c.startswith('Experience_')]
|
||||
dimension_cols = design_cols + entanglement_cols + experience_cols
|
||||
|
||||
# Load cluster assignments
|
||||
clusters = pd.read_csv(data_dir / 'kmeans_clusters.csv')
|
||||
df_with_clusters = df.merge(clusters, on='Descriptor')
|
||||
|
||||
# Prepare data
|
||||
X = df_with_clusters[dimension_cols].values
|
||||
y = df_with_clusters['cluster'].values
|
||||
|
||||
# Standardize
|
||||
scaler = StandardScaler()
|
||||
X_scaled = scaler.fit_transform(X)
|
||||
|
||||
# Fit LDA (with 1 component for 2 classes)
|
||||
lda = LinearDiscriminantAnalysis(n_components=1)
|
||||
X_lda = lda.fit_transform(X_scaled, y)
|
||||
|
||||
# Create histogram showing separation
|
||||
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 10))
|
||||
|
||||
# Histogram
|
||||
colors = {1: '#2E86AB', 2: '#A23B72'}
|
||||
for cluster_id in [1, 2]:
|
||||
cluster_data = X_lda[y == cluster_id]
|
||||
ax1.hist(cluster_data, bins=30, alpha=0.6,
|
||||
color=colors[cluster_id],
|
||||
label=f'Cluster {cluster_id}',
|
||||
edgecolor='white', linewidth=0.5)
|
||||
|
||||
ax1.set_xlabel('Linear Discriminant (LD1)', fontsize=12)
|
||||
ax1.set_ylabel('Frequency', fontsize=12)
|
||||
ax1.set_title('Linear Discriminant Analysis: Cluster Separation\n(Maximum separation projection)',
|
||||
fontsize=14, fontweight='bold')
|
||||
ax1.legend(fontsize=11)
|
||||
ax1.grid(True, alpha=0.3, axis='y')
|
||||
|
||||
# Strip plot - shows individual protocols
|
||||
for cluster_id in [1, 2]:
|
||||
cluster_data = X_lda[y == cluster_id]
|
||||
cluster_protocols = df_with_clusters[df_with_clusters['cluster'] == cluster_id]['Descriptor'].values
|
||||
|
||||
# Add jitter for visibility
|
||||
y_jitter = np.random.normal(cluster_id, 0.1, size=len(cluster_data))
|
||||
|
||||
ax2.scatter(cluster_data, y_jitter,
|
||||
c=colors[cluster_id], alpha=0.5, s=40,
|
||||
edgecolors='white', linewidth=0.3)
|
||||
|
||||
# Label a few representative protocols
|
||||
for i in range(0, len(cluster_data), 25):
|
||||
ax2.annotate(cluster_protocols[i],
|
||||
(cluster_data[i], y_jitter[i]),
|
||||
fontsize=7, alpha=0.7,
|
||||
xytext=(0, 5), textcoords='offset points',
|
||||
rotation=45, ha='left')
|
||||
|
||||
ax2.set_xlabel('Linear Discriminant (LD1)', fontsize=12)
|
||||
ax2.set_ylabel('Cluster', fontsize=12)
|
||||
ax2.set_yticks([1, 2])
|
||||
ax2.set_yticklabels(['Cluster 1:\nRelational/Cultural', 'Cluster 2:\nInstitutional/Bureaucratic'])
|
||||
ax2.set_title('Individual Protocols Projected onto Discriminant Axis', fontsize=12)
|
||||
ax2.grid(True, alpha=0.3, axis='x')
|
||||
|
||||
plt.tight_layout()
|
||||
plt.savefig(plots_dir / 'lda_cluster_separation.png', dpi=300, bbox_inches='tight')
|
||||
print(f"Saved: {plots_dir / 'lda_cluster_separation.png'}")
|
||||
|
||||
# Calculate separation metrics
|
||||
mean_1 = X_lda[y == 1].mean()
|
||||
mean_2 = X_lda[y == 2].mean()
|
||||
std_1 = X_lda[y == 1].std()
|
||||
std_2 = X_lda[y == 2].std()
|
||||
|
||||
# Cohen's d (effect size)
|
||||
pooled_std = np.sqrt((std_1**2 + std_2**2) / 2)
|
||||
cohens_d = abs(mean_1 - mean_2) / pooled_std
|
||||
|
||||
print(f"\n=== Cluster Separation Statistics ===")
|
||||
mean_1_val = mean_1[0] if isinstance(mean_1, np.ndarray) else mean_1
|
||||
mean_2_val = mean_2[0] if isinstance(mean_2, np.ndarray) else mean_2
|
||||
cohens_d_val = cohens_d[0] if isinstance(cohens_d, np.ndarray) else cohens_d
|
||||
print(f"Cluster 1 mean: {mean_1_val:.3f} (std: {std_1:.3f})")
|
||||
print(f"Cluster 2 mean: {mean_2_val:.3f} (std: {std_2:.3f})")
|
||||
print(f"Distance between means: {abs(mean_1_val - mean_2_val):.3f}")
|
||||
print(f"Cohen's d (effect size): {cohens_d_val:.3f}")
|
||||
print(f" (0.2=small, 0.5=medium, 0.8=large effect)")
|
||||
|
||||
# Overlap percentage (rough estimate)
|
||||
overlap_start = max(X_lda[y == 1].min(), X_lda[y == 2].min())
|
||||
overlap_end = min(X_lda[y == 1].max(), X_lda[y == 2].max())
|
||||
overlap_range = overlap_end - overlap_start if overlap_end > overlap_start else 0
|
||||
total_range = max(X_lda.max() - X_lda.min())
|
||||
overlap_pct = (overlap_range / total_range) * 100 if overlap_range > 0 else 0
|
||||
|
||||
print(f"Approximate overlap: {overlap_pct:.1f}% of total range")
|
||||
|
||||
# Save LDA projection data
|
||||
lda_df = pd.DataFrame({
|
||||
'Descriptor': df_with_clusters['Descriptor'],
|
||||
'LD1': X_lda.flatten(),
|
||||
'Cluster': y
|
||||
})
|
||||
lda_df.to_csv(data_dir / 'lda_projection.csv', index=False)
|
||||
print(f"Saved: {data_dir / 'lda_projection.csv'}")
|
||||
|
||||
print("\n=== Most discriminating dimensions ===")
|
||||
# The LDA coefficients tell us which dimensions contribute most to separation
|
||||
loadings = pd.DataFrame({
|
||||
'Dimension': dimension_cols,
|
||||
'LDA_Coefficient': lda.coef_[0]
|
||||
})
|
||||
loadings['Abs_Coefficient'] = loadings['LDA_Coefficient'].abs()
|
||||
loadings = loadings.sort_values('Abs_Coefficient', ascending=False)
|
||||
|
||||
print("\nTop 10 dimensions that separate the clusters:")
|
||||
for _, row in loadings.head(10).iterrows():
|
||||
print(f" {row['Dimension']}: {row['LDA_Coefficient']:.3f}")
|
||||
|
||||
loadings.to_csv(data_dir / 'lda_coefficients.csv', index=False)
|
||||
print(f"\nSaved: {data_dir / 'lda_coefficients.csv'}")
|
||||
Reference in New Issue
Block a user