Added classifier analysis to bicorder ascii and web app
This commit is contained in:
366
analysis/bicorder_classifier.py
Normal file
366
analysis/bicorder_classifier.py
Normal file
@@ -0,0 +1,366 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Bicorder Cluster Classifier
|
||||
|
||||
Provides real-time protocol classification and smart form recommendation
|
||||
based on the two-cluster analysis.
|
||||
|
||||
Usage:
|
||||
from bicorder_classifier import BicorderClassifier
|
||||
|
||||
classifier = BicorderClassifier()
|
||||
|
||||
# As user fills in dimensions
|
||||
ratings = {
|
||||
'Design_explicit_vs_implicit': 7,
|
||||
'Design_elite_vs_vernacular': 2,
|
||||
# ... etc
|
||||
}
|
||||
|
||||
result = classifier.predict(ratings)
|
||||
print(f"Cluster: {result['cluster']}")
|
||||
print(f"Confidence: {result['confidence']:.1%}")
|
||||
print(f"Recommend form: {result['recommended_form']}")
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class BicorderClassifier:
    """
    Classifies protocols into one of two families and recommends form type.

    On construction the classifier reads the rated protocols and their
    k-means cluster assignments from CSV, standardises the ratings, and fits
    a one-component Linear Discriminant Analysis on them.  ``predict`` then
    places a (possibly partial) set of ratings on that discriminant axis.
    """

    # Dimension names (in order).  The order defines the column order of the
    # feature vector given to the scaler and the LDA, so it must not change
    # without refitting.
    DIMENSIONS = [
        'Design_explicit_vs_implicit',
        'Design_precise_vs_interpretive',
        'Design_elite_vs_vernacular',
        'Design_documenting_vs_enabling',
        'Design_static_vs_malleable',
        'Design_technical_vs_social',
        'Design_universal_vs_particular',
        'Design_durable_vs_ephemeral',
        'Entanglement_macro_vs_micro',
        'Entanglement_sovereign_vs_subsidiary',
        'Entanglement_self-enforcing_vs_enforced',
        'Entanglement_abstract_vs_embodied',
        'Entanglement_obligatory_vs_voluntary',
        'Entanglement_flocking_vs_swarming',
        'Entanglement_defensible_vs_exposed',
        'Entanglement_exclusive_vs_non-exclusive',
        'Experience_sufficient_vs_insufficient',
        'Experience_crystallized_vs_contested',
        'Experience_trust-evading_vs_trust-inducing',
        'Experience_predictable_vs_emergent',
        'Experience_exclusion_vs_inclusion',
        'Experience_Kafka_vs_Whitehead',
        'Experience_dead_vs_alive',
    ]

    # Cluster names, keyed by the label found in the 'cluster' column of the
    # k-means CSV.
    CLUSTER_NAMES = {
        1: "Relational/Cultural",
        2: "Institutional/Bureaucratic",
    }

    # Key dimensions for short form (most discriminative)
    # Based on LDA analysis - top differentiating dimensions
    KEY_DIMENSIONS = [
        'Design_elite_vs_vernacular',               # 4.602 difference
        'Entanglement_flocking_vs_swarming',        # 4.079 difference
        'Design_static_vs_malleable',               # 3.775 difference
        'Entanglement_obligatory_vs_voluntary',     # 3.648 difference
        'Entanglement_self-enforcing_vs_enforced',  # 3.628 difference
        'Design_explicit_vs_implicit',              # High importance
        'Entanglement_sovereign_vs_subsidiary',     # High importance
        'Design_technical_vs_social',               # High importance
    ]

    # Value substituted for dimensions the caller did not rate
    # (middle of the 1-9 scale).
    NEUTRAL_RATING = 5.0
    # |LDA score| at or above which the raw confidence saturates at 1.0.
    CONFIDENCE_SCALE = 3.0
    # The short form is only recommended when all three limits are met.
    MIN_SHORT_FORM_CONFIDENCE = 0.6
    MIN_SHORT_FORM_COMPLETENESS = 0.5
    BOUNDARY_MARGIN = 0.5

    def __init__(self, model_path='analysis_results/data',
                 data_path='diagnostic_output.csv'):
        """
        Initialize classifier with pre-computed model data.

        Args:
            model_path: Directory containing ``kmeans_clusters.csv``.
            data_path: CSV file with one row per protocol, a ``Descriptor``
                column and one column per entry in ``DIMENSIONS``.  Defaults
                to the previously hard-coded ``diagnostic_output.csv`` in
                the current working directory.
        """
        self.model_path = Path(model_path)
        self.data_path = Path(data_path)
        self.scaler = StandardScaler()
        self.lda = None
        self.cluster_centroids = None

        # Load training data to fit scaler and LDA
        self._load_model()

    def _load_model(self):
        """Load and fit the classification model from analysis results."""
        # Load the original data and cluster assignments
        ratings_df = pd.read_csv(self.data_path)
        clusters_df = pd.read_csv(self.model_path / 'kmeans_clusters.csv')

        # Keep only the first row for each Descriptor so the merge below
        # cannot multiply rows.
        ratings_df = ratings_df.drop_duplicates(subset='Descriptor', keep='first')

        # Merge and drop protocols that lack a rating for any dimension
        merged = ratings_df.merge(clusters_df, on='Descriptor')
        merged_clean = merged.dropna(subset=self.DIMENSIONS)

        # Prepare training data
        X = merged_clean[self.DIMENSIONS].values
        y = merged_clean['cluster'].values

        # Fit scaler
        self.scaler.fit(X)
        X_scaled = self.scaler.transform(X)

        # Fit LDA
        self.lda = LinearDiscriminantAnalysis(n_components=1)
        self.lda.fit(X_scaled, y)

        # Calculate cluster centroids in scaled space
        self.cluster_centroids = {
            cluster_id: X_scaled[y == cluster_id].mean(axis=0)
            for cluster_id in self.CLUSTER_NAMES
        }

    def predict(self, ratings, return_details=True):
        """
        Predict cluster for given ratings.

        Args:
            ratings: Dict mapping dimension names to values (1-9).
                Can be partial - dimensions that are absent, ``None`` or NaN
                are treated as missing and filled with the scale midpoint
                (5).  Keys that are not in ``DIMENSIONS`` are ignored.
            return_details: If True, returns detailed information

        Returns:
            Dict with (the first four keys only when ``return_details`` is
            False):
                - cluster: Predicted cluster number (1 or 2)
                - cluster_name: Human-readable cluster name
                - confidence: Confidence score (0-1), already scaled down
                  for incomplete input
                - recommended_form: 'short' or 'long'
                - completeness: Fraction of dimensions provided (0-1)
                - dimensions_provided / dimensions_total: Raw counts
                - distance_to_boundary: How far from cluster boundary
                - lda_score: Score on the discriminant axis
                - distances_to_centroids: Euclidean distance (scaled space)
                  to each cluster centroid
                - key_dimensions_provided / key_dimensions_total: Counts
                  restricted to ``KEY_DIMENSIONS``
        """
        # Convert ratings to full vector; NaN marks "not provided".
        vector = np.full(len(self.DIMENSIONS), np.nan)
        for index, dim in enumerate(self.DIMENSIONS):
            value = ratings.get(dim)
            if value is not None:
                vector[index] = value

        # Count only usable values so that None/NaN entries do not inflate
        # completeness (and, through it, the confidence).
        missing = np.isnan(vector)
        provided = {
            dim for dim, is_missing in zip(self.DIMENSIONS, missing)
            if not is_missing
        }
        provided_count = len(provided)
        completeness = provided_count / len(self.DIMENSIONS)

        # Fill missing values with the middle of the 1-9 scale
        vector[missing] = self.NEUTRAL_RATING

        # Scale
        X_scaled = self.scaler.transform(vector.reshape(1, -1))

        # Predict cluster
        cluster = int(self.lda.predict(X_scaled)[0])

        # Get LDA score (position on discriminant axis)
        lda_score = self.lda.decision_function(X_scaled)[0]

        # Calculate confidence based on distance from decision boundary
        # LDA decision boundary is at 0
        distance_to_boundary = abs(lda_score)

        # Confidence: higher when further from boundary, saturating at 1.0
        confidence = min(1.0, distance_to_boundary / self.CONFIDENCE_SCALE)

        # Adjust confidence based on completeness: with no dimensions
        # provided the confidence is halved, with all of them it is kept.
        adjusted_confidence = confidence * (0.5 + 0.5 * completeness)

        # Recommend form
        # Use long form when:
        # 1. Low confidence
        # 2. Low completeness
        # 3. Near boundary
        needs_long_form = (
            adjusted_confidence < self.MIN_SHORT_FORM_CONFIDENCE
            or completeness < self.MIN_SHORT_FORM_COMPLETENESS
            or distance_to_boundary < self.BOUNDARY_MARGIN
        )
        recommended_form = 'long' if needs_long_form else 'short'

        cluster_name = self.CLUSTER_NAMES[cluster]

        if not return_details:
            return {
                'cluster': cluster,
                'cluster_name': cluster_name,
                'confidence': float(adjusted_confidence),
                'recommended_form': recommended_form,
            }

        # Calculate distances to each centroid
        distances = {
            cluster_id: float(np.linalg.norm(X_scaled - centroid))
            for cluster_id, centroid in self.cluster_centroids.items()
        }

        return {
            'cluster': cluster,
            'cluster_name': cluster_name,
            'confidence': float(adjusted_confidence),
            'completeness': float(completeness),
            'dimensions_provided': provided_count,
            'dimensions_total': len(self.DIMENSIONS),
            'recommended_form': recommended_form,
            'distance_to_boundary': float(distance_to_boundary),
            'lda_score': float(lda_score),
            'distances_to_centroids': distances,
            'key_dimensions_provided': sum(
                1 for dim in self.KEY_DIMENSIONS if dim in provided
            ),
            'key_dimensions_total': len(self.KEY_DIMENSIONS),
        }

    def get_key_dimensions(self):
        """Return a copy of the most important dimensions for classification."""
        return self.KEY_DIMENSIONS.copy()

    def get_short_form_dimensions(self):
        """Return a copy of the recommended dimensions for short form."""
        # Copy so that callers cannot mutate the shared class-level list.
        return self.KEY_DIMENSIONS.copy()

    def explain_classification(self, ratings):
        """
        Provide human-readable explanation of classification.

        Args:
            ratings: Dict mapping dimension names to values

        Returns:
            String explanation (lines joined with newlines)
        """
        result = self.predict(ratings, return_details=True)

        explanation = [
            f"Protocol Classification: {result['cluster_name']}",
            f"Confidence: {result['confidence']:.0%}",
            "",
        ]

        # NOTE(review): a positive score is described as
        # Institutional/Bureaucratic, which presumes the fitted LDA treats
        # label 2 as its positive class - confirm against the training data.
        if result['lda_score'] > 0:
            explanation.extend([
                "This protocol leans toward Institutional/Bureaucratic characteristics:",
                " - More likely to be formal, standardized, top-down",
                " - May involve state/corporate enforcement",
                " - Tends toward precise, documented procedures",
            ])
        else:
            explanation.extend([
                "This protocol leans toward Relational/Cultural characteristics:",
                " - More likely to be emergent, community-based",
                " - May involve voluntary participation",
                " - Tends toward interpretive, flexible practices",
            ])

        explanation.append("")
        explanation.append(f"Distance from boundary: {result['distance_to_boundary']:.2f}")

        near_boundary = result['distance_to_boundary'] < self.BOUNDARY_MARGIN
        if near_boundary:
            explanation.append("⚠️ This protocol is near the boundary between families.")
            explanation.append(" It may exhibit characteristics of both types.")

        explanation.append("")
        explanation.append(
            f"Completeness: {result['completeness']:.0%} "
            f"({result['dimensions_provided']}/{result['dimensions_total']} dimensions)"
        )

        if result['completeness'] < 1.0:
            explanation.append("Note: Missing dimensions filled with neutral values (5)")
            explanation.append(" Confidence improves with complete data")

        explanation.append("")
        explanation.append(f"Recommended form: {result['recommended_form'].upper()}")

        if result['recommended_form'] == 'long':
            # List every condition that triggered the long-form
            # recommendation, using the same thresholds as predict().
            explanation.append("Reason: Use long form for:")
            if result['confidence'] < self.MIN_SHORT_FORM_CONFIDENCE:
                explanation.append(" - Low classification confidence")
            if result['completeness'] < self.MIN_SHORT_FORM_COMPLETENESS:
                explanation.append(" - Incomplete data")
            if near_boundary:
                explanation.append(" - Ambiguous positioning between families")
        else:
            explanation.append(
                f"Reason: High confidence classification with {result['completeness']:.0%} data"
            )

        return "\n".join(explanation)

    def save_model(self, output_path='bicorder_classifier_model.json'):
        """
        Save model parameters for use without scikit-learn.

        Writes the dimension lists, cluster names, scaler mean/scale, LDA
        coefficients/intercept and the cluster centroids as JSON.

        Args:
            output_path: Destination file; overwritten if it exists.

        Returns:
            The ``output_path`` that was written.
        """
        model_data = {
            'dimensions': self.DIMENSIONS,
            'key_dimensions': self.KEY_DIMENSIONS,
            'cluster_names': self.CLUSTER_NAMES,
            'scaler_mean': self.scaler.mean_.tolist(),
            'scaler_std': self.scaler.scale_.tolist(),
            'lda_coef': self.lda.coef_.tolist(),
            'lda_intercept': self.lda.intercept_.tolist(),
            # JSON object keys must be strings, so stringify the labels.
            'cluster_centroids': {
                str(cluster_id): centroid.tolist()
                for cluster_id, centroid in self.cluster_centroids.items()
            },
        }

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(model_data, f, indent=2)

        print(f"Model saved to {output_path}")
        return output_path
|
||||
|
||||
|
||||
def main():
    """Run the demo: classify three sample protocols, then export the model.

    Prints a banner, the explanation for each sample rating set, saves the
    fitted model with ``save_model()`` and finally lists the key dimensions.
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    print(heavy_rule)
    print("BICORDER CLUSTER CLASSIFIER - DEMO")
    print(heavy_rule)

    classifier = BicorderClassifier()

    # Each entry: (lines printed before the light rule, sample ratings).
    # The first example follows the banner directly, so it has no heavy rule
    # of its own; the later ones are separated by blank lines and a rule.
    demos = [
        (
            # Relational/Cultural protocol (e.g., Indigenous knowledge sharing)
            ["\nExample 1: Community-Based Protocol"],
            {
                'Design_elite_vs_vernacular': 9,            # Very vernacular
                'Design_explicit_vs_implicit': 8,           # More implicit
                'Entanglement_flocking_vs_swarming': 9,     # Swarming
                'Entanglement_obligatory_vs_voluntary': 9,  # Voluntary
                'Design_static_vs_malleable': 8,            # Malleable
                'Design_technical_vs_social': 9,            # Social
            },
        ),
        (
            # Institutional protocol (e.g., Airport security)
            ["\n\n" + heavy_rule, "Example 2: Institutional Protocol"],
            {
                'Design_elite_vs_vernacular': 1,            # Elite
                'Design_explicit_vs_implicit': 1,           # Very explicit
                'Entanglement_flocking_vs_swarming': 1,     # Flocking
                'Entanglement_obligatory_vs_voluntary': 1,  # Obligatory
                'Design_static_vs_malleable': 2,            # Static
                'Design_technical_vs_social': 2,            # Technical
                'Entanglement_sovereign_vs_subsidiary': 1,  # Sovereign
            },
        ),
        (
            # Ambiguous/boundary protocol
            ["\n\n" + heavy_rule, "Example 3: Boundary Protocol (mixed characteristics)"],
            {
                'Design_elite_vs_vernacular': 5,            # Middle
                'Design_explicit_vs_implicit': 4,           # Slightly toward explicit
                'Entanglement_flocking_vs_swarming': 5,     # Middle
                'Entanglement_obligatory_vs_voluntary': 6,  # Slightly voluntary
            },
        ),
    ]

    for heading_lines, sample_ratings in demos:
        for heading in heading_lines:
            print(heading)
        print(light_rule)
        print(classifier.explain_classification(sample_ratings))

    # Save model
    print("\n\n" + heavy_rule)
    classifier.save_model()

    print("\nKey dimensions for short form:")
    for name in classifier.get_key_dimensions():
        print(f" - {name}")


if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user