#!/usr/bin/env python3 """ Create LDA visualization to maximize cluster separation. """ import pandas as pd import numpy as np import matplotlib.pyplot as plt from sklearn.discriminant_analysis import LinearDiscriminantAnalysis from sklearn.preprocessing import StandardScaler from pathlib import Path # Set up paths output_dir = Path('analysis_results') plots_dir = output_dir / 'plots' data_dir = output_dir / 'data' # Load the original data df = pd.read_csv('diagnostic_output.csv') # Identify dimension columns all_cols = df.columns.tolist() design_cols = [c for c in all_cols if c.startswith('Design_')] entanglement_cols = [c for c in all_cols if c.startswith('Entanglement_')] experience_cols = [c for c in all_cols if c.startswith('Experience_')] dimension_cols = design_cols + entanglement_cols + experience_cols # Load cluster assignments clusters = pd.read_csv(data_dir / 'kmeans_clusters.csv') df_with_clusters = df.merge(clusters, on='Descriptor') # Prepare data X = df_with_clusters[dimension_cols].values y = df_with_clusters['cluster'].values # Standardize scaler = StandardScaler() X_scaled = scaler.fit_transform(X) # Fit LDA (with 1 component for 2 classes) lda = LinearDiscriminantAnalysis(n_components=1) X_lda = lda.fit_transform(X_scaled, y) # Create histogram showing separation fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 10)) # Histogram colors = {1: '#2E86AB', 2: '#A23B72'} for cluster_id in [1, 2]: cluster_data = X_lda[y == cluster_id] ax1.hist(cluster_data, bins=30, alpha=0.6, color=colors[cluster_id], label=f'Cluster {cluster_id}', edgecolor='white', linewidth=0.5) ax1.set_xlabel('Linear Discriminant (LD1)', fontsize=12) ax1.set_ylabel('Frequency', fontsize=12) ax1.set_title('Linear Discriminant Analysis: Cluster Separation\n(Maximum separation projection)', fontsize=14, fontweight='bold') ax1.legend(fontsize=11) ax1.grid(True, alpha=0.3, axis='y') # Strip plot - shows individual protocols for cluster_id in [1, 2]: cluster_data = X_lda[y == cluster_id] cluster_protocols = df_with_clusters[df_with_clusters['cluster'] == cluster_id]['Descriptor'].values # Add jitter for visibility y_jitter = np.random.normal(cluster_id, 0.1, size=len(cluster_data)) ax2.scatter(cluster_data, y_jitter, c=colors[cluster_id], alpha=0.5, s=40, edgecolors='white', linewidth=0.3) # Label a few representative protocols for i in range(0, len(cluster_data), 25): ax2.annotate(cluster_protocols[i], (cluster_data[i], y_jitter[i]), fontsize=7, alpha=0.7, xytext=(0, 5), textcoords='offset points', rotation=45, ha='left') ax2.set_xlabel('Linear Discriminant (LD1)', fontsize=12) ax2.set_ylabel('Cluster', fontsize=12) ax2.set_yticks([1, 2]) ax2.set_yticklabels(['Cluster 1:\nRelational/Cultural', 'Cluster 2:\nInstitutional/Bureaucratic']) ax2.set_title('Individual Protocols Projected onto Discriminant Axis', fontsize=12) ax2.grid(True, alpha=0.3, axis='x') plt.tight_layout() plt.savefig(plots_dir / 'lda_cluster_separation.png', dpi=300, bbox_inches='tight') print(f"Saved: {plots_dir / 'lda_cluster_separation.png'}") # Calculate separation metrics mean_1 = X_lda[y == 1].mean() mean_2 = X_lda[y == 2].mean() std_1 = X_lda[y == 1].std() std_2 = X_lda[y == 2].std() # Cohen's d (effect size) pooled_std = np.sqrt((std_1**2 + std_2**2) / 2) cohens_d = abs(mean_1 - mean_2) / pooled_std print(f"\n=== Cluster Separation Statistics ===") mean_1_val = mean_1[0] if isinstance(mean_1, np.ndarray) else mean_1 mean_2_val = mean_2[0] if isinstance(mean_2, np.ndarray) else mean_2 cohens_d_val = cohens_d[0] if isinstance(cohens_d, np.ndarray) else cohens_d print(f"Cluster 1 mean: {mean_1_val:.3f} (std: {std_1:.3f})") print(f"Cluster 2 mean: {mean_2_val:.3f} (std: {std_2:.3f})") print(f"Distance between means: {abs(mean_1_val - mean_2_val):.3f}") print(f"Cohen's d (effect size): {cohens_d_val:.3f}") print(f" (0.2=small, 0.5=medium, 0.8=large effect)") # Overlap percentage (rough estimate) overlap_start = max(X_lda[y == 1].min(), X_lda[y == 2].min()) overlap_end = min(X_lda[y == 1].max(), X_lda[y == 2].max()) overlap_range = overlap_end - overlap_start if overlap_end > overlap_start else 0 total_range = max(X_lda.max() - X_lda.min()) overlap_pct = (overlap_range / total_range) * 100 if overlap_range > 0 else 0 print(f"Approximate overlap: {overlap_pct:.1f}% of total range") # Save LDA projection data lda_df = pd.DataFrame({ 'Descriptor': df_with_clusters['Descriptor'], 'LD1': X_lda.flatten(), 'Cluster': y }) lda_df.to_csv(data_dir / 'lda_projection.csv', index=False) print(f"Saved: {data_dir / 'lda_projection.csv'}") print("\n=== Most discriminating dimensions ===") # The LDA coefficients tell us which dimensions contribute most to separation loadings = pd.DataFrame({ 'Dimension': dimension_cols, 'LDA_Coefficient': lda.coef_[0] }) loadings['Abs_Coefficient'] = loadings['LDA_Coefficient'].abs() loadings = loadings.sort_values('Abs_Coefficient', ascending=False) print("\nTop 10 dimensions that separate the clusters:") for _, row in loadings.head(10).iterrows(): print(f" {row['Dimension']}: {row['LDA_Coefficient']:.3f}") loadings.to_csv(data_dir / 'lda_coefficients.csv', index=False) print(f"\nSaved: {data_dir / 'lda_coefficients.csv'}")