#!/usr/bin/env python3
"""
Bicorder Cluster Classifier

Provides real-time protocol classification and smart form recommendation
based on the two-cluster analysis.

Usage:
    from bicorder_classifier import BicorderClassifier

    classifier = BicorderClassifier()

    # As user fills in dimensions
    ratings = {
        'Design_explicit_vs_implicit': 7,
        'Design_elite_vs_vernacular': 2,
        # ... etc
    }

    result = classifier.predict(ratings)
    print(f"Cluster: {result['cluster']}")
    print(f"Confidence: {result['confidence']:.1%}")
    print(f"Recommend form: {result['recommended_form']}")
"""

import json
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler


class BicorderClassifier:
    """
    Classifies protocols into one of two families and recommends form type.

    The model is a StandardScaler followed by a one-component
    LinearDiscriminantAnalysis, both fitted at construction time from the
    diagnostic CSV and the k-means cluster assignments.
    """

    # Dimension names (in order)
    DIMENSIONS = [
        'Design_explicit_vs_implicit',
        'Design_precise_vs_interpretive',
        'Design_elite_vs_vernacular',
        'Design_documenting_vs_enabling',
        'Design_static_vs_malleable',
        'Design_technical_vs_social',
        'Design_universal_vs_particular',
        'Design_durable_vs_ephemeral',
        'Entanglement_macro_vs_micro',
        'Entanglement_sovereign_vs_subsidiary',
        'Entanglement_self-enforcing_vs_enforced',
        'Entanglement_abstract_vs_embodied',
        'Entanglement_obligatory_vs_voluntary',
        'Entanglement_flocking_vs_swarming',
        'Entanglement_defensible_vs_exposed',
        'Entanglement_exclusive_vs_non-exclusive',
        'Experience_sufficient_vs_insufficient',
        'Experience_crystallized_vs_contested',
        'Experience_trust-evading_vs_trust-inducing',
        'Experience_predictable_vs_emergent',
        'Experience_exclusion_vs_inclusion',
        'Experience_Kafka_vs_Whitehead',
        'Experience_dead_vs_alive',
    ]

    # Cluster names
    CLUSTER_NAMES = {
        1: "Relational/Cultural",
        2: "Institutional/Bureaucratic"
    }

    # Key dimensions for short form (most discriminative)
    # Based on LDA analysis - top differentiating dimensions
    KEY_DIMENSIONS = [
        'Design_elite_vs_vernacular',  # 4.602 difference
        'Entanglement_flocking_vs_swarming',  # 4.079 difference
        'Design_static_vs_malleable',  # 3.775 difference
        'Entanglement_obligatory_vs_voluntary',  # 3.648 difference
        'Entanglement_self-enforcing_vs_enforced',  # 3.628 difference
        'Design_explicit_vs_implicit',  # High importance
        'Entanglement_sovereign_vs_subsidiary',  # High importance
        'Design_technical_vs_social',  # High importance
    ]

    # Value substituted for dimensions the caller did not rate
    # (middle of the 1-9 scale).
    SCALE_MIDPOINT = 5.0
    # |LDA score| treated as full confidence (typical strong separation).
    STRONG_SEPARATION = 3.0
    # Thresholds below which the long form is recommended. Shared by
    # predict() and explain_classification() so the two cannot drift apart.
    MIN_CONFIDENCE = 0.6
    MIN_COMPLETENESS = 0.5
    MIN_BOUNDARY_DISTANCE = 0.5

    def __init__(self, model_path='analysis_results/data',
                 data_path='diagnostic_output.csv'):
        """
        Initialize classifier and fit it from the analysis outputs.

        Args:
            model_path: Directory containing 'kmeans_clusters.csv'.
            data_path: CSV with one row per protocol, holding a 'Descriptor'
                column and the DIMENSIONS columns.
        """
        self.model_path = Path(model_path)
        self.data_path = Path(data_path)
        self.scaler = StandardScaler()
        self.lda = None
        self.cluster_centroids = None

        # Load training data to fit scaler and LDA
        self._load_model()

    def _load_model(self):
        """Load and fit the classification model from analysis results."""
        # Load the original data and cluster assignments
        df = pd.read_csv(self.data_path)
        clusters = pd.read_csv(self.model_path / 'kmeans_clusters.csv')

        # Remove duplicates so the merge does not multiply rows
        df = df.drop_duplicates(subset='Descriptor', keep='first')

        # Merge and keep only rows with every dimension rated
        merged = df.merge(clusters, on='Descriptor')
        merged_clean = merged.dropna(subset=self.DIMENSIONS)

        # Prepare training data
        X = merged_clean[self.DIMENSIONS].values
        y = merged_clean['cluster'].values

        # Fit scaler, then fit LDA in the scaled space
        X_scaled = self.scaler.fit_transform(X)
        self.lda = LinearDiscriminantAnalysis(n_components=1)
        self.lda.fit(X_scaled, y)

        # Calculate cluster centroids in scaled space
        self.cluster_centroids = {
            cluster_id: X_scaled[y == cluster_id].mean(axis=0)
            for cluster_id in self.CLUSTER_NAMES
        }

    @staticmethod
    def _is_missing(value):
        """Return True when a rating is absent in practice (None or NaN)."""
        if value is None:
            return True
        try:
            return bool(np.isnan(value))
        except (TypeError, ValueError):
            # Not something isnan understands (e.g. a numeric string);
            # let the array assignment decide whether it is usable.
            return False

    def _ratings_to_vector(self, ratings):
        """Build the full feature vector and the set of rated dimensions."""
        X = np.full(len(self.DIMENSIONS), self.SCALE_MIDPOINT)
        provided = set()
        for i, dim in enumerate(self.DIMENSIONS):
            if dim not in ratings:
                continue
            value = ratings[dim]
            # An unfilled field (None/NaN) must not count as provided,
            # otherwise completeness and confidence are overstated.
            if self._is_missing(value):
                continue
            X[i] = value
            provided.add(dim)
        return X, provided

    def predict(self, ratings, return_details=True):
        """
        Predict cluster for given ratings.

        Args:
            ratings: Dict mapping dimension names to values (1-9).
                Can be partial - dimensions that are absent, None or NaN
                are filled with the scale midpoint (5). Keys that are not
                in DIMENSIONS are ignored.
            return_details: If True, returns detailed information

        Returns:
            Dict with:
            - cluster: Predicted cluster number (1 or 2)
            - cluster_name: Human-readable cluster name
            - confidence: Confidence score (0-1), already scaled down for
              incomplete input
            - recommended_form: 'short' or 'long'
            and, when return_details is True, additionally:
            - completeness: Fraction of dimensions provided (0-1)
            - dimensions_provided / dimensions_total
            - distance_to_boundary: How far from cluster boundary
            - lda_score: Score on the discriminant axis
            - distances_to_centroids: Euclidean distance to each centroid
              in scaled space
            - key_dimensions_provided / key_dimensions_total
        """
        # Convert ratings to full vector (missing -> scale midpoint)
        X, provided = self._ratings_to_vector(ratings)
        provided_count = len(provided)
        completeness = provided_count / len(self.DIMENSIONS)

        # Scale
        X_scaled = self.scaler.transform(X.reshape(1, -1))

        # Predict cluster
        cluster = int(self.lda.predict(X_scaled)[0])

        # Get LDA score (position on discriminant axis)
        lda_score = self.lda.decision_function(X_scaled)[0]

        # Calculate confidence based on distance from decision boundary
        # LDA decision boundary is at 0
        distance_to_boundary = abs(lda_score)

        # Confidence: higher when further from boundary, capped at 1
        confidence = min(1.0, distance_to_boundary / self.STRONG_SEPARATION)

        # Adjust confidence based on completeness: an empty form halves it
        adjusted_confidence = confidence * (0.5 + 0.5 * completeness)

        # Recommend form
        # Use long form when:
        # 1. Low confidence (< 0.6)
        # 2. Low completeness (< 0.5 of dimensions provided)
        # 3. Near boundary (< 0.5 distance)
        needs_long_form = (
            adjusted_confidence < self.MIN_CONFIDENCE
            or completeness < self.MIN_COMPLETENESS
            or distance_to_boundary < self.MIN_BOUNDARY_DISTANCE
        )
        recommended_form = 'long' if needs_long_form else 'short'

        if not return_details:
            return {
                'cluster': cluster,
                'cluster_name': self.CLUSTER_NAMES[cluster],
                'confidence': float(adjusted_confidence),
                'recommended_form': recommended_form
            }

        # Calculate distances to each centroid
        distances = {
            cluster_id: float(np.linalg.norm(X_scaled - centroid))
            for cluster_id, centroid in self.cluster_centroids.items()
        }

        return {
            'cluster': cluster,
            'cluster_name': self.CLUSTER_NAMES[cluster],
            'confidence': float(adjusted_confidence),
            'completeness': float(completeness),
            'dimensions_provided': provided_count,
            'dimensions_total': len(self.DIMENSIONS),
            'recommended_form': recommended_form,
            'distance_to_boundary': float(distance_to_boundary),
            'lda_score': float(lda_score),
            'distances_to_centroids': distances,
            'key_dimensions_provided': sum(
                1 for dim in self.KEY_DIMENSIONS if dim in provided
            ),
            'key_dimensions_total': len(self.KEY_DIMENSIONS),
        }

    def get_key_dimensions(self):
        """Return the most important dimensions for classification."""
        return self.KEY_DIMENSIONS.copy()

    def get_short_form_dimensions(self):
        """Return recommended dimensions for short form."""
        # Copy so callers cannot mutate the shared class-level list.
        return self.KEY_DIMENSIONS.copy()

    def explain_classification(self, ratings):
        """
        Provide human-readable explanation of classification.

        Args:
            ratings: Dict mapping dimension names to values

        Returns:
            String explanation
        """
        result = self.predict(ratings, return_details=True)

        explanation = []
        explanation.append(f"Protocol Classification: {result['cluster_name']}")
        explanation.append(f"Confidence: {result['confidence']:.0%}")
        explanation.append("")

        # A positive discriminant score falls on the cluster-2 side.
        if result['lda_score'] > 0:
            explanation.append("This protocol leans toward Institutional/Bureaucratic characteristics:")
            explanation.append(" - More likely to be formal, standardized, top-down")
            explanation.append(" - May involve state/corporate enforcement")
            explanation.append(" - Tends toward precise, documented procedures")
        else:
            explanation.append("This protocol leans toward Relational/Cultural characteristics:")
            explanation.append(" - More likely to be emergent, community-based")
            explanation.append(" - May involve voluntary participation")
            explanation.append(" - Tends toward interpretive, flexible practices")

        explanation.append("")
        explanation.append(f"Distance from boundary: {result['distance_to_boundary']:.2f}")

        near_boundary = result['distance_to_boundary'] < self.MIN_BOUNDARY_DISTANCE
        if near_boundary:
            explanation.append("⚠️ This protocol is near the boundary between families.")
            explanation.append(" It may exhibit characteristics of both types.")

        explanation.append("")
        explanation.append(
            f"Completeness: {result['completeness']:.0%} "
            f"({result['dimensions_provided']}/{result['dimensions_total']} dimensions)"
        )

        if result['completeness'] < 1.0:
            explanation.append("Note: Missing dimensions filled with neutral values (5)")
            explanation.append(" Confidence improves with complete data")

        explanation.append("")
        explanation.append(f"Recommended form: {result['recommended_form'].upper()}")

        if result['recommended_form'] == 'long':
            explanation.append("Reason: Use long form for:")
            if result['confidence'] < self.MIN_CONFIDENCE:
                explanation.append(" - Low classification confidence")
            if result['completeness'] < self.MIN_COMPLETENESS:
                explanation.append(" - Incomplete data")
            if near_boundary:
                explanation.append(" - Ambiguous positioning between families")
        else:
            explanation.append(
                f"Reason: High confidence classification with {result['completeness']:.0%} data"
            )

        return "\n".join(explanation)

    def save_model(self, output_path='bicorder_classifier_model.json'):
        """
        Save model parameters for use without scikit-learn.

        Args:
            output_path: Destination JSON file.

        Returns:
            The output_path that was written.
        """
        model_data = {
            'dimensions': self.DIMENSIONS,
            'key_dimensions': self.KEY_DIMENSIONS,
            # json.dump writes the integer cluster ids as string keys
            'cluster_names': self.CLUSTER_NAMES,
            'scaler_mean': self.scaler.mean_.tolist(),
            'scaler_std': self.scaler.scale_.tolist(),
            'lda_coef': self.lda.coef_.tolist(),
            'lda_intercept': self.lda.intercept_.tolist(),
            'cluster_centroids': {
                str(k): v.tolist() for k, v in self.cluster_centroids.items()
            }
        }

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(model_data, f, indent=2)

        print(f"Model saved to {output_path}")
        return output_path


def main():
    """Demo usage of the classifier."""
    print("=" * 80)
    print("BICORDER CLUSTER CLASSIFIER - DEMO")
    print("=" * 80)

    classifier = BicorderClassifier()

    # Example 1: Relational/Cultural protocol (e.g., Indigenous knowledge sharing)
    print("\nExample 1: Community-Based Protocol")
    print("-" * 80)
    ratings_relational = {
        'Design_elite_vs_vernacular': 9,  # Very vernacular
        'Design_explicit_vs_implicit': 8,  # More implicit
        'Entanglement_flocking_vs_swarming': 9,  # Swarming
        'Entanglement_obligatory_vs_voluntary': 9,  # Voluntary
        'Design_static_vs_malleable': 8,  # Malleable
        'Design_technical_vs_social': 9,  # Social
    }
    print(classifier.explain_classification(ratings_relational))

    # Example 2: Institutional protocol (e.g., Airport security)
    print("\n\n" + "=" * 80)
    print("Example 2: Institutional Protocol")
    print("-" * 80)
    ratings_institutional = {
        'Design_elite_vs_vernacular': 1,  # Elite
        'Design_explicit_vs_implicit': 1,  # Very explicit
        'Entanglement_flocking_vs_swarming': 1,  # Flocking
        'Entanglement_obligatory_vs_voluntary': 1,  # Obligatory
        'Design_static_vs_malleable': 2,  # Static
        'Design_technical_vs_social': 2,  # Technical
        'Entanglement_sovereign_vs_subsidiary': 1,  # Sovereign
    }
    print(classifier.explain_classification(ratings_institutional))

    # Example 3: Ambiguous/boundary protocol
    print("\n\n" + "=" * 80)
    print("Example 3: Boundary Protocol (mixed characteristics)")
    print("-" * 80)
    ratings_boundary = {
        'Design_elite_vs_vernacular': 5,  # Middle
        'Design_explicit_vs_implicit': 4,  # Slightly implicit
        'Entanglement_flocking_vs_swarming': 5,  # Middle
        'Entanglement_obligatory_vs_voluntary': 6,  # Slightly voluntary
    }
    print(classifier.explain_classification(ratings_boundary))

    # Save model
    print("\n\n" + "=" * 80)
    classifier.save_model()

    print("\nKey dimensions for short form:")
    for dim in classifier.get_key_dimensions():
        print(f" - {dim}")


if __name__ == '__main__':
    main()