Initial analysis complete

analysis/multivariate_analysis.py (new executable file, 827 lines added)

@@ -0,0 +1,827 @@
#!/usr/bin/env python3
"""
Multivariate Analysis Script for Protocol Bicorder Data

Performs comprehensive multivariate statistical analyses on protocol diagnostic data,
including clustering, dimensionality reduction, correlation analysis, and visualization.

Usage:
    python3 multivariate_analysis.py diagnostic_output.csv [--analyses all]
    python3 multivariate_analysis.py diagnostic_output.csv --analyses clustering pca
"""

import argparse
import sys
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from scipy.spatial.distance import pdist, squareform
import networkx as nx

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import silhouette_score, davies_bouldin_score

try:
    import umap
    UMAP_AVAILABLE = True
except ImportError:
    UMAP_AVAILABLE = False
    print("Note: UMAP not available. Install with: pip install umap-learn")


class ProtocolAnalyzer:
    """Main class for multivariate analysis of protocol data."""

    def __init__(self, csv_path, output_dir='analysis_results'):
        """Initialize analyzer with data and output directory."""
        self.csv_path = Path(csv_path)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(exist_ok=True)

        # Create subdirectories
        (self.output_dir / 'plots').mkdir(exist_ok=True)
        (self.output_dir / 'data').mkdir(exist_ok=True)
        (self.output_dir / 'reports').mkdir(exist_ok=True)

        # Load and prepare data
        self.df = None
        self.dimension_cols = []
        self.design_cols = []
        self.entanglement_cols = []
        self.experience_cols = []
        self.scaled_data = None
        self.scaler = None

        self._load_data()

    def _load_data(self):
        """Load CSV and identify dimension columns."""
        print(f"Loading data from {self.csv_path}...")
        self.df = pd.read_csv(self.csv_path)

        # Identify dimension columns
        all_cols = self.df.columns.tolist()
        self.design_cols = [c for c in all_cols if c.startswith('Design_')]
        self.entanglement_cols = [c for c in all_cols if c.startswith('Entanglement_')]
        self.experience_cols = [c for c in all_cols if c.startswith('Experience_')]
        self.dimension_cols = self.design_cols + self.entanglement_cols + self.experience_cols

        print(f"Loaded {len(self.df)} protocols with {len(self.dimension_cols)} dimensions")
        print(f"  - Design: {len(self.design_cols)}")
        print(f"  - Entanglement: {len(self.entanglement_cols)}")
        print(f"  - Experience: {len(self.experience_cols)}")

        # Check for missing values
        missing_count = self.df[self.dimension_cols].isna().sum().sum()
        rows_with_missing = self.df[self.dimension_cols].isna().any(axis=1).sum()

        if missing_count > 0:
            print(f"\nWarning: Found {missing_count} missing values in {rows_with_missing} rows")
            print("Dropping rows with missing values...")
            self.df = self.df.dropna(subset=self.dimension_cols)
            # Reset to a 0..n-1 index so positional lookups (iloc, enumerate)
            # and column inserts later on stay aligned with scaled_data
            self.df = self.df.reset_index(drop=True)
            print(f"Dataset now contains {len(self.df)} protocols")

        # Standardize the dimension data
        self.scaler = StandardScaler()
        self.scaled_data = self.scaler.fit_transform(self.df[self.dimension_cols])

    def save_results(self, data, filename, subdir='data'):
        """Save results to CSV file."""
        output_path = self.output_dir / subdir / filename
        if isinstance(data, pd.DataFrame):
            # Keep meaningful row labels (e.g. dimension names on correlation
            # matrices and loadings); only a default integer index is dropped
            keep_index = not isinstance(data.index, pd.RangeIndex)
            data.to_csv(output_path, index=keep_index)
        else:
            pd.DataFrame(data).to_csv(output_path)
        print(f"  Saved: {output_path}")

    def save_plot(self, filename):
        """Save current matplotlib figure."""
        output_path = self.output_dir / 'plots' / filename
        plt.tight_layout()
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
        print(f"  Saved: {output_path}")
        plt.close()

    # ========== CLUSTERING ANALYSES ==========

    def kmeans_clustering(self, n_clusters_range=(2, 10)):
        """Perform K-means clustering with elbow method."""
        print("\n=== K-Means Clustering ===")

        # Elbow method
        inertias = []
        silhouettes = []
        k_range = range(n_clusters_range[0], n_clusters_range[1] + 1)

        for k in k_range:
            kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
            labels = kmeans.fit_predict(self.scaled_data)
            inertias.append(kmeans.inertia_)
            if k > 1:
                silhouettes.append(silhouette_score(self.scaled_data, labels))
            else:
                silhouettes.append(0)

        # Plot elbow curve
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

        ax1.plot(k_range, inertias, 'bo-')
        ax1.set_xlabel('Number of Clusters (k)')
        ax1.set_ylabel('Inertia')
        ax1.set_title('Elbow Method for Optimal k')
        ax1.grid(True, alpha=0.3)

        ax2.plot(k_range, silhouettes, 'ro-')
        ax2.set_xlabel('Number of Clusters (k)')
        ax2.set_ylabel('Silhouette Score')
        ax2.set_title('Silhouette Score by k')
        ax2.grid(True, alpha=0.3)

        self.save_plot('kmeans_elbow.png')

        # Use optimal k (highest silhouette)
        optimal_k = k_range[np.argmax(silhouettes)]
        print(f"Optimal k by silhouette score: {optimal_k}")

        # Final clustering
        kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
        self.df['kmeans_cluster'] = kmeans.fit_predict(self.scaled_data)

        # Save results
        results = self.df[['Descriptor', 'kmeans_cluster']].copy()
        results['cluster'] = results['kmeans_cluster'] + 1  # 1-indexed for readability
        self.save_results(results[['Descriptor', 'cluster']], 'kmeans_clusters.csv')

        # Cluster statistics
        print("\nCluster sizes:")
        print(self.df['kmeans_cluster'].value_counts().sort_index())

        return self.df['kmeans_cluster']

    def hierarchical_clustering(self, n_clusters=5, method='ward'):
        """Perform hierarchical clustering with dendrogram."""
        print("\n=== Hierarchical Clustering ===")

        # Compute linkage
        Z = linkage(self.scaled_data, method=method)

        # Plot dendrogram
        plt.figure(figsize=(16, 8))
        dendrogram(Z, labels=self.df['Descriptor'].values, leaf_font_size=8)
        plt.title(f'Hierarchical Clustering Dendrogram ({method} linkage)')
        plt.xlabel('Protocol')
        plt.ylabel('Distance')
        plt.xticks(rotation=90)
        self.save_plot('hierarchical_dendrogram.png')

        # Cut tree to get clusters
        self.df['hierarchical_cluster'] = fcluster(Z, n_clusters, criterion='maxclust')

        # Save results
        results = self.df[['Descriptor', 'hierarchical_cluster']].copy()
        results.columns = ['Descriptor', 'cluster']
        self.save_results(results, 'hierarchical_clusters.csv')

        print("\nCluster sizes:")
        print(self.df['hierarchical_cluster'].value_counts().sort_index())

        return self.df['hierarchical_cluster']

    def dbscan_clustering(self, eps=3.0, min_samples=3):
        """Perform DBSCAN clustering to identify outliers."""
        print("\n=== DBSCAN Clustering ===")

        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        self.df['dbscan_cluster'] = dbscan.fit_predict(self.scaled_data)

        # DBSCAN labels noise points as -1; check membership against the label
        # values (not the Series index) when counting clusters
        labels = set(self.df['dbscan_cluster'])
        n_clusters = len(labels) - (1 if -1 in labels else 0)
        n_outliers = (self.df['dbscan_cluster'] == -1).sum()

        print(f"Found {n_clusters} clusters and {n_outliers} outliers")

        # Save results
        results = self.df[['Descriptor', 'dbscan_cluster']].copy()
        results.columns = ['Descriptor', 'cluster']
        self.save_results(results, 'dbscan_clusters.csv')

        if n_outliers > 0:
            outliers = self.df[self.df['dbscan_cluster'] == -1][['Descriptor']]
            self.save_results(outliers, 'dbscan_outliers.csv')
            print("\nOutlier protocols:")
            for protocol in outliers['Descriptor']:
                print(f"  - {protocol}")

        return self.df['dbscan_cluster']

    # ========== DIMENSIONALITY REDUCTION ==========

    def pca_analysis(self, n_components=None):
        """Perform PCA and visualize results."""
        print("\n=== Principal Component Analysis ===")

        # Fit PCA
        if n_components is None:
            pca = PCA()
        else:
            pca = PCA(n_components=n_components)

        pca_coords = pca.fit_transform(self.scaled_data)

        # Explained variance
        explained_var = pca.explained_variance_ratio_
        cumsum_var = np.cumsum(explained_var)

        n_top = min(5, len(cumsum_var))
        print(f"First {n_top} PCs explain {cumsum_var[n_top - 1]*100:.1f}% of variance")

        # Plot explained variance
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

        n_show = min(15, len(explained_var))
        ax1.bar(range(1, n_show + 1), explained_var[:n_show])
        ax1.set_xlabel('Principal Component')
        ax1.set_ylabel('Explained Variance Ratio')
        ax1.set_title('Variance Explained by Each PC')
        ax1.grid(True, alpha=0.3, axis='y')

        ax2.plot(range(1, n_show + 1), cumsum_var[:n_show], 'o-')
        ax2.axhline(y=0.8, color='r', linestyle='--', alpha=0.5, label='80% threshold')
        ax2.set_xlabel('Number of Components')
        ax2.set_ylabel('Cumulative Explained Variance')
        ax2.set_title('Cumulative Variance Explained')
        ax2.legend()
        ax2.grid(True, alpha=0.3)

        self.save_plot('pca_variance.png')

        # 2D visualization
        plt.figure(figsize=(12, 10))
        plt.scatter(pca_coords[:, 0], pca_coords[:, 1], alpha=0.6, s=50)

        # Annotate points
        for i, protocol in enumerate(self.df['Descriptor']):
            if i % 3 == 0:  # Label every 3rd point to avoid clutter
                plt.annotate(protocol, (pca_coords[i, 0], pca_coords[i, 1]),
                             fontsize=6, alpha=0.7)

        plt.xlabel(f'PC1 ({explained_var[0]*100:.1f}% variance)')
        plt.ylabel(f'PC2 ({explained_var[1]*100:.1f}% variance)')
        plt.title('Protocols in PCA Space (First 2 Components)')
        plt.grid(True, alpha=0.3)
        self.save_plot('pca_2d.png')

        # Save PCA coordinates
        pca_df = pd.DataFrame(pca_coords[:, :5],
                              columns=[f'PC{i+1}' for i in range(min(5, pca_coords.shape[1]))])
        pca_df.insert(0, 'Descriptor', self.df['Descriptor'])
        self.save_results(pca_df, 'pca_coordinates.csv')

        # Component loadings
        loadings = pd.DataFrame(
            pca.components_[:5, :].T,
            columns=[f'PC{i+1}' for i in range(min(5, pca.components_.shape[0]))],
            index=self.dimension_cols
        )
        self.save_results(loadings, 'pca_loadings.csv')

        # Plot loadings heatmap
        plt.figure(figsize=(10, 12))
        sns.heatmap(loadings, cmap='RdBu_r', center=0, cbar_kws={'label': 'Loading'})
        plt.title('PCA Component Loadings')
        plt.tight_layout()
        self.save_plot('pca_loadings_heatmap.png')

        return pca_coords, pca

    def tsne_analysis(self, perplexity=30, n_components=2):
        """Perform t-SNE analysis."""
        print("\n=== t-SNE Analysis ===")

        # Note: the max_iter argument assumes a recent scikit-learn (older
        # releases name this parameter n_iter); perplexity must also be
        # smaller than the number of protocols
        tsne = TSNE(n_components=n_components, perplexity=perplexity,
                    random_state=42, max_iter=1000)
        tsne_coords = tsne.fit_transform(self.scaled_data)

        # Plot
        plt.figure(figsize=(12, 10))
        plt.scatter(tsne_coords[:, 0], tsne_coords[:, 1], alpha=0.6, s=50)

        # Annotate some points
        for i, protocol in enumerate(self.df['Descriptor']):
            if i % 4 == 0:  # Label every 4th point
                plt.annotate(protocol, (tsne_coords[i, 0], tsne_coords[i, 1]),
                             fontsize=6, alpha=0.7)

        plt.xlabel('t-SNE Dimension 1')
        plt.ylabel('t-SNE Dimension 2')
        plt.title(f't-SNE Projection (perplexity={perplexity})')
        plt.grid(True, alpha=0.3)
        self.save_plot('tsne_2d.png')

        # Save coordinates
        tsne_df = pd.DataFrame(tsne_coords, columns=['TSNE1', 'TSNE2'])
        tsne_df.insert(0, 'Descriptor', self.df['Descriptor'])
        self.save_results(tsne_df, 'tsne_coordinates.csv')

        return tsne_coords

    def umap_analysis(self, n_neighbors=15, min_dist=0.1, n_components=2):
        """Perform UMAP analysis if available."""
        if not UMAP_AVAILABLE:
            print("\n=== UMAP Analysis ===")
            print("UMAP not available. Install with: pip install umap-learn")
            return None

        print("\n=== UMAP Analysis ===")

        reducer = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist,
                            n_components=n_components, random_state=42)
        umap_coords = reducer.fit_transform(self.scaled_data)

        # Plot
        plt.figure(figsize=(12, 10))
        plt.scatter(umap_coords[:, 0], umap_coords[:, 1], alpha=0.6, s=50)

        # Annotate some points
        for i, protocol in enumerate(self.df['Descriptor']):
            if i % 4 == 0:
                plt.annotate(protocol, (umap_coords[i, 0], umap_coords[i, 1]),
                             fontsize=6, alpha=0.7)

        plt.xlabel('UMAP Dimension 1')
        plt.ylabel('UMAP Dimension 2')
        plt.title(f'UMAP Projection (n_neighbors={n_neighbors}, min_dist={min_dist})')
        plt.grid(True, alpha=0.3)
        self.save_plot('umap_2d.png')

        # Save coordinates
        umap_df = pd.DataFrame(umap_coords, columns=['UMAP1', 'UMAP2'])
        umap_df.insert(0, 'Descriptor', self.df['Descriptor'])
        self.save_results(umap_df, 'umap_coordinates.csv')

        return umap_coords

    def factor_analysis(self, n_factors=5):
        """Perform factor analysis."""
        print("\n=== Factor Analysis ===")

        fa = FactorAnalysis(n_components=n_factors, random_state=42)
        fa_coords = fa.fit_transform(self.scaled_data)

        # Factor loadings
        loadings = pd.DataFrame(
            fa.components_.T,
            columns=[f'Factor{i+1}' for i in range(n_factors)],
            index=self.dimension_cols
        )
        self.save_results(loadings, 'factor_loadings.csv')

        # Plot loadings heatmap
        plt.figure(figsize=(10, 12))
        sns.heatmap(loadings, cmap='RdBu_r', center=0, cbar_kws={'label': 'Loading'})
        plt.title('Factor Analysis Loadings')
        plt.tight_layout()
        self.save_plot('factor_loadings_heatmap.png')

        # Save factor scores
        fa_df = pd.DataFrame(fa_coords,
                             columns=[f'Factor{i+1}' for i in range(n_factors)])
        fa_df.insert(0, 'Descriptor', self.df['Descriptor'])
        self.save_results(fa_df, 'factor_scores.csv')

        return fa_coords, fa

    # ========== CORRELATION & STRUCTURE ==========

    def correlation_analysis(self):
        """Compute and visualize correlation matrices."""
        print("\n=== Correlation Analysis ===")

        # Full correlation matrix
        corr_matrix = self.df[self.dimension_cols].corr()

        # Plot full correlation heatmap
        plt.figure(figsize=(16, 14))
        sns.heatmap(corr_matrix, cmap='RdBu_r', center=0,
                    square=True, linewidths=0.5, cbar_kws={'label': 'Correlation'})
        plt.title('Correlation Matrix - All Dimensions')
        plt.tight_layout()
        self.save_plot('correlation_heatmap_full.png')

        # Save correlation matrix
        self.save_results(corr_matrix, 'correlation_matrix.csv')

        # Find strongest correlations
        corr_pairs = []
        for i in range(len(corr_matrix.columns)):
            for j in range(i + 1, len(corr_matrix.columns)):
                corr_pairs.append({
                    'Dimension1': corr_matrix.columns[i],
                    'Dimension2': corr_matrix.columns[j],
                    'Correlation': corr_matrix.iloc[i, j]
                })

        # Ranked by absolute strength for the saved table
        corr_df = pd.DataFrame(corr_pairs).sort_values('Correlation',
                                                       key=abs,
                                                       ascending=False)
        self.save_results(corr_df.head(20), 'top_correlations.csv')

        # Signed ranking so the positive/negative summaries below are accurate
        corr_signed = pd.DataFrame(corr_pairs).sort_values('Correlation', ascending=False)

        print("\nTop 5 positive correlations:")
        for _, row in corr_signed.head(5).iterrows():
            print(f"  {row['Dimension1']} <-> {row['Dimension2']}: {row['Correlation']:.3f}")

        print("\nTop 5 negative correlations:")
        for _, row in corr_signed.tail(5).iterrows():
            print(f"  {row['Dimension1']} <-> {row['Dimension2']}: {row['Correlation']:.3f}")

        # Within-category correlations
        self._plot_category_correlation('Design', self.design_cols)
        self._plot_category_correlation('Entanglement', self.entanglement_cols)
        self._plot_category_correlation('Experience', self.experience_cols)

        return corr_matrix

    def _plot_category_correlation(self, category_name, columns):
        """Plot correlation heatmap for a specific category."""
        corr = self.df[columns].corr()

        plt.figure(figsize=(10, 8))
        sns.heatmap(corr, annot=True, fmt='.2f', cmap='RdBu_r', center=0,
                    square=True, linewidths=0.5, cbar_kws={'label': 'Correlation'})
        plt.title(f'{category_name} Dimensions - Correlation Matrix')
        plt.tight_layout()
        self.save_plot(f'correlation_heatmap_{category_name.lower()}.png')

    def network_analysis(self, threshold=0.5):
        """Create network graph of protocol similarities."""
        print("\n=== Network Analysis ===")

        # Compute pairwise distances
        distances = pdist(self.scaled_data, metric='euclidean')
        dist_matrix = squareform(distances)

        # Convert to similarity (inverse of distance, normalized)
        max_dist = dist_matrix.max()
        similarity_matrix = 1 - (dist_matrix / max_dist)

        # Create network
        G = nx.Graph()

        # Add nodes
        for i, protocol in enumerate(self.df['Descriptor']):
            G.add_node(i, label=protocol)

        # Add edges above threshold
        edge_count = 0
        for i in range(len(similarity_matrix)):
            for j in range(i + 1, len(similarity_matrix)):
                if similarity_matrix[i, j] > threshold:
                    G.add_edge(i, j, weight=similarity_matrix[i, j])
                    edge_count += 1

        print(f"Network with {G.number_of_nodes()} nodes and {edge_count} edges")

        # Calculate network metrics
        if G.number_of_edges() > 0:
            degree_centrality = nx.degree_centrality(G)
            betweenness = nx.betweenness_centrality(G)

            metrics_df = pd.DataFrame({
                'Descriptor': [self.df.iloc[i]['Descriptor'] for i in G.nodes()],
                'Degree_Centrality': [degree_centrality[i] for i in G.nodes()],
                'Betweenness_Centrality': [betweenness[i] for i in G.nodes()]
            }).sort_values('Degree_Centrality', ascending=False)

            self.save_results(metrics_df, 'network_metrics.csv')

            print("\nTop 5 most central protocols:")
            for _, row in metrics_df.head(5).iterrows():
                print(f"  {row['Descriptor']}: {row['Degree_Centrality']:.3f}")

            # Plot network
            plt.figure(figsize=(16, 16))
            pos = nx.spring_layout(G, k=0.5, iterations=50, seed=42)

            # Node sizes based on degree centrality
            node_sizes = [degree_centrality[i] * 3000 + 100 for i in G.nodes()]

            nx.draw_networkx_nodes(G, pos, node_size=node_sizes,
                                   node_color='lightblue', alpha=0.7)
            nx.draw_networkx_edges(G, pos, alpha=0.2)

            # Labels for high-centrality nodes
            high_centrality = {i: self.df.iloc[i]['Descriptor']
                               for i in G.nodes() if degree_centrality[i] > 0.1}
            nx.draw_networkx_labels(G, pos, labels=high_centrality, font_size=8)

            plt.title(f'Protocol Similarity Network (threshold={threshold})')
            plt.axis('off')
            plt.tight_layout()
            self.save_plot('network_graph.png')
        else:
            print("No edges above threshold - try lowering the threshold")

        return G

    # ========== CLASSIFICATION & PREDICTION ==========

    def category_discriminant_analysis(self):
        """Analyze how well dimension categories discriminate protocols."""
        print("\n=== Category Discriminant Analysis ===")

        results = []

        for category_name, columns in [('Design', self.design_cols),
                                        ('Entanglement', self.entanglement_cols),
                                        ('Experience', self.experience_cols)]:

            # Use one category's dimensions to predict the k-means clusters
            X = self.df[columns].values

            # Use kmeans clusters as target if available
            if 'kmeans_cluster' in self.df.columns:
                y = self.df['kmeans_cluster'].values

                # LDA (note: the score is computed on the training data itself,
                # so it is an optimistic, in-sample accuracy)
                try:
                    lda = LinearDiscriminantAnalysis()
                    lda.fit(X, y)
                    score = lda.score(X, y)

                    results.append({
                        'Category': category_name,
                        'Accuracy': score,
                        'N_Dimensions': len(columns)
                    })

                    print(f"{category_name} dimensions predict clusters with {score*100:.1f}% accuracy")
                except Exception:
                    print(f"Could not perform LDA for {category_name}")

        if results:
            results_df = pd.DataFrame(results)
            self.save_results(results_df, 'category_discriminant_results.csv')

        return results

    def feature_importance_analysis(self):
        """Analyze which dimensions are most important for clustering."""
        print("\n=== Feature Importance Analysis ===")

        if 'kmeans_cluster' not in self.df.columns:
            print("Run clustering first to enable feature importance analysis")
            return None

        # Random Forest classifier
        X = self.df[self.dimension_cols].values
        y = self.df['kmeans_cluster'].values

        rf = RandomForestClassifier(n_estimators=100, random_state=42)
        rf.fit(X, y)

        # Feature importances
        importances = pd.DataFrame({
            'Dimension': self.dimension_cols,
            'Importance': rf.feature_importances_
        }).sort_values('Importance', ascending=False)

        self.save_results(importances, 'feature_importances.csv')

        # Plot top 20
        plt.figure(figsize=(10, 12))
        top_20 = importances.head(20)
        plt.barh(range(len(top_20)), top_20['Importance'])
        plt.yticks(range(len(top_20)), top_20['Dimension'])
        plt.xlabel('Importance')
        plt.title('Top 20 Most Important Dimensions for Clustering')
        plt.gca().invert_yaxis()
        plt.tight_layout()
        self.save_plot('feature_importances.png')

        print("\nTop 10 most important dimensions:")
        for _, row in importances.head(10).iterrows():
            print(f"  {row['Dimension']}: {row['Importance']:.4f}")

        return importances

    def analyst_comparison(self):
        """Compare ratings across different analysts."""
        print("\n=== Analyst Comparison ===")

        if 'analyst' not in self.df.columns:
            print("No analyst column found")
            return None

        analysts = self.df['analyst'].unique()
        print(f"Found {len(analysts)} unique analysts")

        # Mean ratings by analyst for each dimension
        analyst_means = self.df.groupby('analyst')[self.dimension_cols].mean()
        self.save_results(analyst_means, 'analyst_mean_ratings.csv')

        # Plot comparison
        fig, axes = plt.subplots(3, 1, figsize=(14, 12))

        for idx, (category_name, columns) in enumerate([
            ('Design', self.design_cols),
            ('Entanglement', self.entanglement_cols),
            ('Experience', self.experience_cols)
        ]):
            analyst_means[columns].T.plot(ax=axes[idx], marker='o')
            axes[idx].set_title(f'{category_name} Dimensions - Mean Ratings by Analyst')
            axes[idx].set_ylabel('Mean Rating')
            axes[idx].legend(title='Analyst', bbox_to_anchor=(1.05, 1), loc='upper left')
            axes[idx].grid(True, alpha=0.3)
            axes[idx].set_xticklabels(axes[idx].get_xticklabels(), rotation=45, ha='right')

        plt.tight_layout()
        self.save_plot('analyst_comparison.png')

        return analyst_means

    # ========== SUMMARY REPORT ==========

    def generate_summary_report(self):
        """Generate a text summary of all analyses."""
        print("\n=== Generating Summary Report ===")

        report_lines = []
        report_lines.append("=" * 80)
        report_lines.append("MULTIVARIATE ANALYSIS SUMMARY REPORT")
        report_lines.append("Protocol Bicorder Dataset")
        report_lines.append("=" * 80)
        report_lines.append("")

        report_lines.append(f"Dataset: {self.csv_path}")
        report_lines.append(f"Number of protocols: {len(self.df)}")
        report_lines.append(f"Number of dimensions: {len(self.dimension_cols)}")
        report_lines.append(f"  - Design: {len(self.design_cols)}")
        report_lines.append(f"  - Entanglement: {len(self.entanglement_cols)}")
        report_lines.append(f"  - Experience: {len(self.experience_cols)}")
        report_lines.append("")

        report_lines.append("-" * 80)
        report_lines.append("ANALYSES PERFORMED")
        report_lines.append("-" * 80)

        # Check which analyses were run
        analyses_run = []

        if 'kmeans_cluster' in self.df.columns:
            analyses_run.append("- K-Means Clustering")
            report_lines.append(f"K-Means: {len(self.df['kmeans_cluster'].unique())} clusters identified")

        if 'hierarchical_cluster' in self.df.columns:
            analyses_run.append("- Hierarchical Clustering")
            report_lines.append(f"Hierarchical: {len(self.df['hierarchical_cluster'].unique())} clusters")

        if 'dbscan_cluster' in self.df.columns:
            analyses_run.append("- DBSCAN Clustering")
            n_outliers = (self.df['dbscan_cluster'] == -1).sum()
            report_lines.append(f"DBSCAN: {n_outliers} outlier protocols identified")

        report_lines.append("")
        report_lines.append("Dimensionality Reduction:")
        report_lines.append("- Principal Component Analysis (PCA)")
        report_lines.append("- t-SNE Projection")
        if UMAP_AVAILABLE:
            report_lines.append("- UMAP Projection")
        report_lines.append("- Factor Analysis")
        report_lines.append("")

        report_lines.append("Statistical Analyses:")
        report_lines.append("- Correlation Analysis")
        report_lines.append("- Network Analysis")
        report_lines.append("- Feature Importance Analysis")

        if 'analyst' in self.df.columns:
            report_lines.append("- Analyst Comparison")

        report_lines.append("")
        report_lines.append("-" * 80)
        report_lines.append("OUTPUT FILES")
        report_lines.append("-" * 80)
        report_lines.append(f"All results saved to: {self.output_dir}/")
        report_lines.append("  - plots/   : All visualizations (PNG)")
        report_lines.append("  - data/    : All numerical results (CSV)")
        report_lines.append("  - reports/ : This summary report")
        report_lines.append("")

        report_lines.append("=" * 80)
        report_lines.append("END OF REPORT")
        report_lines.append("=" * 80)

        report_text = "\n".join(report_lines)

        # Save report
        report_path = self.output_dir / 'reports' / 'analysis_summary.txt'
        with open(report_path, 'w') as f:
            f.write(report_text)

        print(f"  Saved: {report_path}")
        print("\n" + report_text)

        return report_text

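# Programmatic usage sketch (illustrative only; the file name below is a
# placeholder and the CSV must match the layout described at the top of this
# module):
#
#   from multivariate_analysis import ProtocolAnalyzer
#   analyzer = ProtocolAnalyzer('diagnostic_output.csv', output_dir='results')
#   analyzer.kmeans_clustering()            # adds a 'kmeans_cluster' column
#   analyzer.feature_importance_analysis()  # requires the k-means step above
#   analyzer.generate_summary_report()
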
def main():
    """Main execution function."""
    parser = argparse.ArgumentParser(
        description='Multivariate analysis of Protocol Bicorder data',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python3 multivariate_analysis.py diagnostic_output.csv
  python3 multivariate_analysis.py diagnostic_output.csv --output results/
  python3 multivariate_analysis.py diagnostic_output.csv --analyses clustering pca
        """
    )

    parser.add_argument('csv_file', help='Input CSV file with protocol diagnostics')
    parser.add_argument('--output', '-o', default='analysis_results',
                        help='Output directory (default: analysis_results)')
    parser.add_argument('--analyses', nargs='+',
                        choices=['clustering', 'pca', 'tsne', 'umap', 'factor',
                                 'correlation', 'network', 'importance', 'analyst', 'all'],
                        default=['all'],
                        help='Which analyses to run (default: all)')

    args = parser.parse_args()

    # Check if file exists
    if not Path(args.csv_file).exists():
        print(f"Error: File not found: {args.csv_file}")
        sys.exit(1)

    # Initialize analyzer
    print("=" * 80)
    print("PROTOCOL BICORDER - MULTIVARIATE ANALYSIS")
    print("=" * 80)

    analyzer = ProtocolAnalyzer(args.csv_file, args.output)

    # Determine which analyses to run
    run_all = 'all' in args.analyses

    # Run analyses
    try:
        # Clustering
        if run_all or 'clustering' in args.analyses:
            analyzer.kmeans_clustering()
            analyzer.hierarchical_clustering()
            analyzer.dbscan_clustering()

        # Dimensionality reduction
        if run_all or 'pca' in args.analyses:
            analyzer.pca_analysis()

        if run_all or 'tsne' in args.analyses:
            analyzer.tsne_analysis()

        if run_all or 'umap' in args.analyses:
            analyzer.umap_analysis()

        if run_all or 'factor' in args.analyses:
            analyzer.factor_analysis()

        # Correlation and structure
        if run_all or 'correlation' in args.analyses:
            analyzer.correlation_analysis()

        if run_all or 'network' in args.analyses:
            analyzer.network_analysis(threshold=0.6)

        # Classification
        if run_all or 'importance' in args.analyses:
            analyzer.category_discriminant_analysis()
            analyzer.feature_importance_analysis()

        if run_all or 'analyst' in args.analyses:
            analyzer.analyst_comparison()

        # Generate summary
        analyzer.generate_summary_report()

        print("\n" + "=" * 80)
        print("ANALYSIS COMPLETE!")
        print("=" * 80)
        print(f"\nAll results saved to: {analyzer.output_dir}/")

    except Exception as e:
        print(f"\nError during analysis: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == '__main__':
    main()