#!/usr/bin/env python3
"""
Bicorder Cluster Classifier
Provides real-time protocol classification and smart form recommendation
based on the two-cluster analysis.
Usage:
from bicorder_classifier import BicorderClassifier
classifier = BicorderClassifier()
# As user fills in dimensions
ratings = {
'Design_explicit_vs_implicit': 7,
'Design_elite_vs_vernacular': 2,
# ... etc
}
result = classifier.predict(ratings)
print(f"Cluster: {result['cluster']}")
print(f"Confidence: {result['confidence']:.1%}")
print(f"Recommend form: {result['recommended_form']}")
"""
import json
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler


class BicorderClassifier:
"""
Classifies protocols into one of two families and recommends form type.
"""
# Dimension names (in order)
DIMENSIONS = [
'Design_explicit_vs_implicit',
'Design_precise_vs_interpretive',
'Design_elite_vs_vernacular',
'Design_documenting_vs_enabling',
'Design_static_vs_malleable',
'Design_technical_vs_social',
'Design_universal_vs_particular',
'Design_durable_vs_ephemeral',
'Entanglement_macro_vs_micro',
'Entanglement_sovereign_vs_subsidiary',
'Entanglement_self-enforcing_vs_enforced',
'Entanglement_abstract_vs_embodied',
'Entanglement_obligatory_vs_voluntary',
'Entanglement_flocking_vs_swarming',
'Entanglement_defensible_vs_exposed',
'Entanglement_exclusive_vs_non-exclusive',
'Experience_sufficient_vs_insufficient',
'Experience_crystallized_vs_contested',
'Experience_trust-evading_vs_trust-inducing',
'Experience_predictable_vs_emergent',
'Experience_exclusion_vs_inclusion',
'Experience_Kafka_vs_Whitehead',
'Experience_dead_vs_alive',
]
# Cluster names
CLUSTER_NAMES = {
1: "Relational/Cultural",
2: "Institutional/Bureaucratic"
}
# Key dimensions for short form (most discriminative)
# Based on LDA analysis - top differentiating dimensions
KEY_DIMENSIONS = [
'Design_elite_vs_vernacular', # 4.602 difference
'Entanglement_flocking_vs_swarming', # 4.079 difference
'Design_static_vs_malleable', # 3.775 difference
'Entanglement_obligatory_vs_voluntary', # 3.648 difference
'Entanglement_self-enforcing_vs_enforced', # 3.628 difference
'Design_explicit_vs_implicit', # High importance
'Entanglement_sovereign_vs_subsidiary', # High importance
'Design_technical_vs_social', # High importance
]
    def __init__(self, model_path='analysis_results/data'):
        """Initialize classifier with pre-computed model data."""
        self.model_path = Path(model_path)
        self.scaler = StandardScaler()
        self.lda = None
        self.cluster_centroids = None
        # Load training data to fit scaler and LDA
        self._load_model()

    def _load_model(self):
        """Load and fit the classification model from analysis results."""
        # Load the original data and cluster assignments
        # (diagnostic_output.csv is read from the current working directory)
        df = pd.read_csv('diagnostic_output.csv')
        clusters = pd.read_csv(self.model_path / 'kmeans_clusters.csv')

        # Remove duplicates
        df = df.drop_duplicates(subset='Descriptor', keep='first')

        # Merge and drop rows with missing dimension values
        merged = df.merge(clusters, on='Descriptor')
        merged_clean = merged.dropna(subset=self.DIMENSIONS)

        # Prepare training data
        X = merged_clean[self.DIMENSIONS].values
        y = merged_clean['cluster'].values

        # Fit scaler
        self.scaler.fit(X)
        X_scaled = self.scaler.transform(X)

        # Fit LDA
        self.lda = LinearDiscriminantAnalysis(n_components=1)
        self.lda.fit(X_scaled, y)

        # Calculate cluster centroids in scaled space
        self.cluster_centroids = {}
        for cluster_id in [1, 2]:
            cluster_data = X_scaled[y == cluster_id]
            self.cluster_centroids[cluster_id] = cluster_data.mean(axis=0)
    def predict(self, ratings, return_details=True):
        """
        Predict cluster for given ratings.

        Args:
            ratings: Dict mapping dimension names to values (1-9).
                Can be partial; missing dimensions are filled with the
                scale midpoint.
            return_details: If True, returns detailed information.

        Returns:
            Dict with:
            - cluster: Predicted cluster number (1 or 2)
            - cluster_name: Human-readable cluster name
            - confidence: Confidence score (0-1)
            - completeness: Fraction of dimensions provided (0-1)
            - recommended_form: 'short' or 'long'
            - distance_to_boundary: How far from the cluster boundary
            - lda_score: Score on the discriminant axis
        """
        # Convert ratings to a full vector
        X = np.full(len(self.DIMENSIONS), np.nan)
        provided_count = 0
        for i, dim in enumerate(self.DIMENSIONS):
            if dim in ratings:
                X[i] = ratings[dim]
                provided_count += 1
        completeness = provided_count / len(self.DIMENSIONS)

        # Fill missing values with the midpoint of the 1-9 scale
        X[np.isnan(X)] = 5.0

        # Scale
        X_scaled = self.scaler.transform(X.reshape(1, -1))

        # Predict cluster
        cluster = self.lda.predict(X_scaled)[0]

        # Get LDA score (position on the discriminant axis)
        lda_score = self.lda.decision_function(X_scaled)[0]

        # Calculate confidence based on distance from the decision boundary
        # (the LDA decision boundary is at 0)
        distance_to_boundary = abs(lda_score)

        # Confidence: higher when further from the boundary.
        # Normalize based on the observed data range; 3.0 is a typical
        # strong separation.
        confidence = min(1.0, distance_to_boundary / 3.0)

        # Adjust confidence based on completeness
        adjusted_confidence = confidence * (0.5 + 0.5 * completeness)
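
        # Worked example (illustrative numbers, not from the dataset):
        # lda_score = -2.1 with 12 of 23 dimensions provided gives
        # distance_to_boundary = 2.1, raw confidence = 0.70,
        # completeness ≈ 0.52, and adjusted confidence
        # 0.70 * (0.5 + 0.5 * 0.52) ≈ 0.53.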
        # Recommend form. Use the long form when:
        # 1. Low confidence (< 0.6)
        # 2. Low completeness (< 0.5 of dimensions provided)
        # 3. Near the boundary (< 0.5 distance)
        if adjusted_confidence < 0.6 or completeness < 0.5 or distance_to_boundary < 0.5:
            recommended_form = 'long'
        else:
            recommended_form = 'short'

        if not return_details:
            return {
                'cluster': int(cluster),
                'cluster_name': self.CLUSTER_NAMES[cluster],
                'confidence': float(adjusted_confidence),
                'recommended_form': recommended_form,
            }

        # Calculate distances to each centroid
        distances = {}
        for cluster_id, centroid in self.cluster_centroids.items():
            dist = np.linalg.norm(X_scaled - centroid)
            distances[cluster_id] = float(dist)

        return {
            'cluster': int(cluster),
            'cluster_name': self.CLUSTER_NAMES[cluster],
            'confidence': float(adjusted_confidence),
            'completeness': float(completeness),
            'dimensions_provided': provided_count,
            'dimensions_total': len(self.DIMENSIONS),
            'recommended_form': recommended_form,
            'distance_to_boundary': float(distance_to_boundary),
            'lda_score': float(lda_score),
            'distances_to_centroids': distances,
            'key_dimensions_provided': sum(1 for dim in self.KEY_DIMENSIONS if dim in ratings),
            'key_dimensions_total': len(self.KEY_DIMENSIONS),
        }
    def get_key_dimensions(self):
        """Return the most important dimensions for classification."""
        return self.KEY_DIMENSIONS.copy()

    def get_short_form_dimensions(self):
        """Return recommended dimensions for the short form."""
        return self.KEY_DIMENSIONS.copy()
    def explain_classification(self, ratings):
        """
        Provide a human-readable explanation of a classification.

        Args:
            ratings: Dict mapping dimension names to values

        Returns:
            String explanation
        """
        result = self.predict(ratings, return_details=True)
        explanation = []

        explanation.append(f"Protocol Classification: {result['cluster_name']}")
        explanation.append(f"Confidence: {result['confidence']:.0%}")
        explanation.append("")

        if result['lda_score'] > 0:
            explanation.append("This protocol leans toward Institutional/Bureaucratic characteristics:")
            explanation.append(" - More likely to be formal, standardized, top-down")
            explanation.append(" - May involve state/corporate enforcement")
            explanation.append(" - Tends toward precise, documented procedures")
        else:
            explanation.append("This protocol leans toward Relational/Cultural characteristics:")
            explanation.append(" - More likely to be emergent, community-based")
            explanation.append(" - May involve voluntary participation")
            explanation.append(" - Tends toward interpretive, flexible practices")

        explanation.append("")
        explanation.append(f"Distance from boundary: {result['distance_to_boundary']:.2f}")
        if result['distance_to_boundary'] < 0.5:
            explanation.append("⚠️ This protocol is near the boundary between families.")
            explanation.append("   It may exhibit characteristics of both types.")

        explanation.append("")
        explanation.append(f"Completeness: {result['completeness']:.0%} ({result['dimensions_provided']}/{result['dimensions_total']} dimensions)")
        if result['completeness'] < 1.0:
            explanation.append("Note: Missing dimensions filled with neutral values (5)")
            explanation.append("      Confidence improves with complete data")

        explanation.append("")
        explanation.append(f"Recommended form: {result['recommended_form'].upper()}")
        if result['recommended_form'] == 'long':
            explanation.append("Long form recommended because of:")
            if result['confidence'] < 0.6:
                explanation.append(" - Low classification confidence")
            if result['completeness'] < 0.5:
                explanation.append(" - Incomplete data")
            if result['distance_to_boundary'] < 0.5:
                explanation.append(" - Ambiguous positioning between families")
        else:
            explanation.append(f"Reason: High-confidence classification with {result['completeness']:.0%} of data provided")

        return "\n".join(explanation)
    def save_model(self, output_path='bicorder_classifier_model.json'):
        """Save model parameters for use without scikit-learn."""
        model_data = {
            'dimensions': self.DIMENSIONS,
            'key_dimensions': self.KEY_DIMENSIONS,
            'cluster_names': self.CLUSTER_NAMES,
            'scaler_mean': self.scaler.mean_.tolist(),
            'scaler_std': self.scaler.scale_.tolist(),
            'lda_coef': self.lda.coef_.tolist(),
            'lda_intercept': self.lda.intercept_.tolist(),
            'cluster_centroids': {
                str(k): v.tolist() for k, v in self.cluster_centroids.items()
            },
        }
        with open(output_path, 'w') as f:
            json.dump(model_data, f, indent=2)
        print(f"Model saved to {output_path}")
        return output_path
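

def predict_from_saved_model(ratings, model_path='bicorder_classifier_model.json'):
    """Minimal sketch of scikit-learn-free scoring from the exported JSON.

    Illustrative only; this helper is not part of the original module.
    It assumes the file written by save_model() and reimplements the
    binary LDA decision rule, score = x_scaled @ coef + intercept, where
    a positive score maps to cluster 2 and a negative one to cluster 1
    (scikit-learn orders the classes as [1, 2]).
    """
    with open(model_path) as f:
        model = json.load(f)
    # Build the full feature vector, filling gaps with the scale midpoint (5).
    x = np.array([float(ratings.get(dim, 5.0)) for dim in model['dimensions']])
    # Standardize with the stored scaler parameters.
    x_scaled = (x - np.array(model['scaler_mean'])) / np.array(model['scaler_std'])
    # Linear decision score on the discriminant axis.
    score = float(x_scaled @ np.array(model['lda_coef'])[0] + model['lda_intercept'][0])
    cluster = 2 if score > 0 else 1
    # json.dump stringifies the integer keys of CLUSTER_NAMES.
    return {
        'cluster': cluster,
        'cluster_name': model['cluster_names'][str(cluster)],
        'lda_score': score,
    }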

def main():
    """Demo usage of the classifier."""
    print("=" * 80)
    print("BICORDER CLUSTER CLASSIFIER - DEMO")
    print("=" * 80)

    classifier = BicorderClassifier()

    # Example 1: Relational/Cultural protocol (e.g., Indigenous knowledge sharing)
    print("\nExample 1: Community-Based Protocol")
    print("-" * 80)
    ratings_relational = {
        'Design_elite_vs_vernacular': 9,            # Very vernacular
        'Design_explicit_vs_implicit': 8,           # More implicit
        'Entanglement_flocking_vs_swarming': 9,     # Swarming
        'Entanglement_obligatory_vs_voluntary': 9,  # Voluntary
        'Design_static_vs_malleable': 8,            # Malleable
        'Design_technical_vs_social': 9,            # Social
    }
    print(classifier.explain_classification(ratings_relational))

    # Example 2: Institutional protocol (e.g., airport security)
    print("\n\n" + "=" * 80)
    print("Example 2: Institutional Protocol")
    print("-" * 80)
    ratings_institutional = {
        'Design_elite_vs_vernacular': 1,            # Elite
        'Design_explicit_vs_implicit': 1,           # Very explicit
        'Entanglement_flocking_vs_swarming': 1,     # Flocking
        'Entanglement_obligatory_vs_voluntary': 1,  # Obligatory
        'Design_static_vs_malleable': 2,            # Static
        'Design_technical_vs_social': 2,            # Technical
        'Entanglement_sovereign_vs_subsidiary': 1,  # Sovereign
    }
    print(classifier.explain_classification(ratings_institutional))

    # Example 3: Ambiguous/boundary protocol
    print("\n\n" + "=" * 80)
    print("Example 3: Boundary Protocol (mixed characteristics)")
    print("-" * 80)
    ratings_boundary = {
        'Design_elite_vs_vernacular': 5,            # Middle
        'Design_explicit_vs_implicit': 4,           # Slightly explicit
        'Entanglement_flocking_vs_swarming': 5,     # Middle
        'Entanglement_obligatory_vs_voluntary': 6,  # Slightly voluntary
    }
    print(classifier.explain_classification(ratings_boundary))

    # Save model
    print("\n\n" + "=" * 80)
    classifier.save_model()

    print("\nKey dimensions for short form:")
    for dim in classifier.get_key_dimensions():
        print(f" - {dim}")


if __name__ == '__main__':
    main()