Added classifier analysis to bicorder ASCII and web app

2025-12-21 21:38:39 -07:00
parent b541f85553
commit 1b508b911f
17 changed files with 2795 additions and 49 deletions
--- a/analysis/bicorder_classifier.py
+++ b/analysis/bicorder_classifier.py
@@ -0,0 +1,366 @@
+#!/usr/bin/env python3
+"""
+Bicorder Cluster Classifier
+
+Provides real-time protocol classification and smart form recommendation
+based on the two-cluster analysis.
+
+Usage:
+    from bicorder_classifier import BicorderClassifier
+
+    classifier = BicorderClassifier()
+
+    # As user fills in dimensions
+    ratings = {
+        'Design_explicit_vs_implicit': 7,
+        'Design_elite_vs_vernacular': 2,
+        # ... etc
+    }
+
+    result = classifier.predict(ratings)
+    print(f"Cluster: {result['cluster']}")
+    print(f"Confidence: {result['confidence']:.1%}")
+    print(f"Recommend form: {result['recommended_form']}")
+"""
+
+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import StandardScaler
+from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
+import json
+from pathlib import Path
+
+
+class BicorderClassifier:
+    """
+    Classifies protocols into one of two families and recommends form type.
+
+    The model is fitted when the object is constructed: rated protocols and
+    their k-means cluster labels are read from CSV, the dimension ratings are
+    standardized with a StandardScaler, and a one-component
+    LinearDiscriminantAnalysis is trained on the scaled ratings.
+    """
+
+    # Dimension names (in order); this order defines the feature vector layout
+    DIMENSIONS = [
+        'Design_explicit_vs_implicit',
+        'Design_precise_vs_interpretive',
+        'Design_elite_vs_vernacular',
+        'Design_documenting_vs_enabling',
+        'Design_static_vs_malleable',
+        'Design_technical_vs_social',
+        'Design_universal_vs_particular',
+        'Design_durable_vs_ephemeral',
+        'Entanglement_macro_vs_micro',
+        'Entanglement_sovereign_vs_subsidiary',
+        'Entanglement_self-enforcing_vs_enforced',
+        'Entanglement_abstract_vs_embodied',
+        'Entanglement_obligatory_vs_voluntary',
+        'Entanglement_flocking_vs_swarming',
+        'Entanglement_defensible_vs_exposed',
+        'Entanglement_exclusive_vs_non-exclusive',
+        'Experience_sufficient_vs_insufficient',
+        'Experience_crystallized_vs_contested',
+        'Experience_trust-evading_vs_trust-inducing',
+        'Experience_predictable_vs_emergent',
+        'Experience_exclusion_vs_inclusion',
+        'Experience_Kafka_vs_Whitehead',
+        'Experience_dead_vs_alive',
+    ]
+
+    # Cluster names
+    CLUSTER_NAMES = {
+        1: "Relational/Cultural",
+        2: "Institutional/Bureaucratic"
+    }
+
+    # Key dimensions for short form (most discriminative)
+    # Based on LDA analysis - top differentiating dimensions
+    KEY_DIMENSIONS = [
+        'Design_elite_vs_vernacular',              # 4.602 difference
+        'Entanglement_flocking_vs_swarming',       # 4.079 difference
+        'Design_static_vs_malleable',              # 3.775 difference
+        'Entanglement_obligatory_vs_voluntary',    # 3.648 difference
+        'Entanglement_self-enforcing_vs_enforced', # 3.628 difference
+        'Design_explicit_vs_implicit',             # High importance
+        'Entanglement_sovereign_vs_subsidiary',    # High importance
+        'Design_technical_vs_social',              # High importance
+    ]
+
+    # Bullet points printed by explain_classification(), keyed by cluster id
+    _CLUSTER_TRAITS = {
+        1: [
+            "  - More likely to be emergent, community-based",
+            "  - May involve voluntary participation",
+            "  - Tends toward interpretive, flexible practices",
+        ],
+        2: [
+            "  - More likely to be formal, standardized, top-down",
+            "  - May involve state/corporate enforcement",
+            "  - Tends toward precise, documented procedures",
+        ],
+    }
+
+    # Tuning constants shared by predict() and explain_classification()
+    _NEUTRAL_RATING = 5.0         # midpoint of the 1-9 scale; stands in for missing ratings
+    _STRONG_SEPARATION = 3.0      # |LDA score| at which raw confidence saturates at 1.0
+    _MIN_CONFIDENCE = 0.6         # adjusted confidence below this -> long form
+    _MIN_COMPLETENESS = 0.5       # fraction of dimensions below this -> long form
+    _MIN_BOUNDARY_DISTANCE = 0.5  # |LDA score| below this -> "near the boundary"
+
+    def __init__(self, model_path='analysis_results/data',
+                 data_path='diagnostic_output.csv'):
+        """
+        Initialize classifier and fit it from the analysis output files.
+
+        Args:
+            model_path: Directory containing 'kmeans_clusters.csv'
+            data_path: CSV file with one row per protocol and one column per
+                    dimension (defaults to the previously hard-coded
+                    'diagnostic_output.csv' in the working directory)
+        """
+        self.model_path = Path(model_path)
+        self.data_path = Path(data_path)
+        self.scaler = StandardScaler()
+        self.lda = None
+        self.cluster_centroids = None
+
+        # Load training data to fit scaler and LDA
+        self._load_model()
+
+    def _load_model(self):
+        """
+        Load the training data, then fit the scaler, the LDA and the centroids.
+
+        Rows are joined to their cluster label on the 'Descriptor' column;
+        rows lacking a rating for any dimension are excluded from training.
+        """
+        # Load the original data and cluster assignments
+        df = pd.read_csv(self.data_path)
+        clusters = pd.read_csv(self.model_path / 'kmeans_clusters.csv')
+
+        # Keep only the first row for each Descriptor so the merge below
+        # does not duplicate protocols
+        df = df.drop_duplicates(subset='Descriptor', keep='first')
+
+        # Merge and clean
+        merged = df.merge(clusters, on='Descriptor')
+        merged_clean = merged.dropna(subset=self.DIMENSIONS)
+
+        # Prepare training data
+        X = merged_clean[self.DIMENSIONS].values
+        y = merged_clean['cluster'].values
+
+        # Fit scaler
+        self.scaler.fit(X)
+        X_scaled = self.scaler.transform(X)
+
+        # Fit LDA
+        self.lda = LinearDiscriminantAnalysis(n_components=1)
+        self.lda.fit(X_scaled, y)
+
+        # Calculate cluster centroids in scaled space, one per named cluster
+        self.cluster_centroids = {
+            cluster_id: X_scaled[y == cluster_id].mean(axis=0)
+            for cluster_id in self.CLUSTER_NAMES
+        }
+
+    def predict(self, ratings, return_details=True):
+        """
+        Predict cluster for given ratings.
+
+        Args:
+            ratings: Dict mapping dimension names to values (1-9).
+                    Can be partial - a dimension that is absent, None or NaN
+                    counts as missing and is filled with the scale midpoint
+                    (5). Keys that are not in DIMENSIONS are ignored.
+            return_details: If True, returns detailed information
+
+        Returns:
+            Dict that always contains:
+                - cluster: Predicted cluster number (1 or 2)
+                - cluster_name: Human-readable cluster name
+                - confidence: Confidence score (0-1), scaled down when
+                  dimensions are missing
+                - recommended_form: 'short' or 'long'
+            and, when return_details is True, also:
+                - completeness: Fraction of dimensions provided (0-1)
+                - dimensions_provided / dimensions_total
+                - distance_to_boundary: Absolute value of lda_score
+                - lda_score: Score on the discriminant axis
+                - distances_to_centroids: Euclidean distance, in scaled
+                  space, to each cluster centroid
+                - key_dimensions_provided / key_dimensions_total
+        """
+        # Convert ratings to full vector; NaN marks "not provided"
+        n_dims = len(self.DIMENSIONS)
+        X = np.full(n_dims, np.nan)
+        for i, dim in enumerate(self.DIMENSIONS):
+            value = ratings.get(dim)
+            if value is not None:
+                X[i] = value
+
+        # Derive the provided mask from the vector itself so that None and
+        # NaN ratings are not counted towards completeness
+        provided = ~np.isnan(X)
+        provided_count = int(provided.sum())
+        completeness = provided_count / n_dims
+
+        # Fill missing values with the middle of the 1-9 scale
+        X[~provided] = self._NEUTRAL_RATING
+
+        # Scale
+        X_scaled = self.scaler.transform(X.reshape(1, -1))
+
+        # Predict cluster (plain int so it is a clean dict key / JSON value)
+        cluster = int(self.lda.predict(X_scaled)[0])
+
+        # Get LDA score (position on discriminant axis)
+        lda_score = float(self.lda.decision_function(X_scaled)[0])
+
+        # LDA decision boundary is at 0, so |score| is the distance from it
+        distance_to_boundary = abs(lda_score)
+
+        # Confidence: higher when further from boundary, capped at 1.0
+        confidence = min(1.0, distance_to_boundary / self._STRONG_SEPARATION)
+
+        # Adjust confidence based on completeness (halved when nothing is provided)
+        adjusted_confidence = confidence * (0.5 + 0.5 * completeness)
+
+        # Recommend the long form when the result is weak for any reason:
+        # low confidence, too few dimensions, or a score close to the boundary
+        needs_long_form = (
+            adjusted_confidence < self._MIN_CONFIDENCE
+            or completeness < self._MIN_COMPLETENESS
+            or distance_to_boundary < self._MIN_BOUNDARY_DISTANCE
+        )
+        recommended_form = 'long' if needs_long_form else 'short'
+
+        if not return_details:
+            return {
+                'cluster': cluster,
+                'cluster_name': self.CLUSTER_NAMES[cluster],
+                'confidence': float(adjusted_confidence),
+                'recommended_form': recommended_form
+            }
+
+        # Calculate distances to each centroid
+        distances = {
+            cluster_id: float(np.linalg.norm(X_scaled - centroid))
+            for cluster_id, centroid in self.cluster_centroids.items()
+        }
+
+        provided_dims = {
+            dim for dim, is_provided in zip(self.DIMENSIONS, provided) if is_provided
+        }
+
+        return {
+            'cluster': cluster,
+            'cluster_name': self.CLUSTER_NAMES[cluster],
+            'confidence': float(adjusted_confidence),
+            'completeness': float(completeness),
+            'dimensions_provided': provided_count,
+            'dimensions_total': n_dims,
+            'recommended_form': recommended_form,
+            'distance_to_boundary': float(distance_to_boundary),
+            'lda_score': lda_score,
+            'distances_to_centroids': distances,
+            'key_dimensions_provided': sum(
+                1 for dim in self.KEY_DIMENSIONS if dim in provided_dims
+            ),
+            'key_dimensions_total': len(self.KEY_DIMENSIONS),
+        }
+
+    def get_key_dimensions(self):
+        """Return a copy of the most important dimensions for classification."""
+        return list(self.KEY_DIMENSIONS)
+
+    def get_short_form_dimensions(self):
+        """Return a copy of the recommended dimensions for short form."""
+        # Copy so callers cannot mutate the shared class-level list
+        return list(self.KEY_DIMENSIONS)
+
+    def explain_classification(self, ratings):
+        """
+        Provide human-readable explanation of classification.
+
+        Args:
+            ratings: Dict mapping dimension names to values
+                    (same format as accepted by predict())
+
+        Returns:
+            Multi-line string explanation
+        """
+        result = self.predict(ratings, return_details=True)
+
+        near_boundary = result['distance_to_boundary'] < self._MIN_BOUNDARY_DISTANCE
+
+        explanation = [
+            f"Protocol Classification: {result['cluster_name']}",
+            f"Confidence: {result['confidence']:.0%}",
+            "",
+            # Describe the same family that the headline reports
+            f"This protocol leans toward {result['cluster_name']} characteristics:",
+        ]
+        explanation.extend(self._CLUSTER_TRAITS[result['cluster']])
+
+        explanation.append("")
+        explanation.append(f"Distance from boundary: {result['distance_to_boundary']:.2f}")
+
+        if near_boundary:
+            explanation.append("⚠️  This protocol is near the boundary between families.")
+            explanation.append("   It may exhibit characteristics of both types.")
+
+        explanation.append("")
+        explanation.append(
+            f"Completeness: {result['completeness']:.0%} "
+            f"({result['dimensions_provided']}/{result['dimensions_total']} dimensions)"
+        )
+
+        if result['completeness'] < 1.0:
+            explanation.append("Note: Missing dimensions filled with neutral values (5)")
+            explanation.append("      Confidence improves with complete data")
+
+        explanation.append("")
+        explanation.append(f"Recommended form: {result['recommended_form'].upper()}")
+
+        if result['recommended_form'] == 'long':
+            # List every condition that triggered the long-form recommendation
+            explanation.append("Reason: Use long form for:")
+            if result['confidence'] < self._MIN_CONFIDENCE:
+                explanation.append("  - Low classification confidence")
+            if result['completeness'] < self._MIN_COMPLETENESS:
+                explanation.append("  - Incomplete data")
+            if near_boundary:
+                explanation.append("  - Ambiguous positioning between families")
+        else:
+            explanation.append(
+                f"Reason: High confidence classification with {result['completeness']:.0%} data"
+            )
+
+        return "\n".join(explanation)
+
+    def save_model(self, output_path='bicorder_classifier_model.json'):
+        """
+        Save model parameters for use without scikit-learn.
+
+        Writes the dimension lists, cluster names, scaler mean/scale, LDA
+        coefficients/intercept and scaled-space centroids as JSON.
+
+        Args:
+            output_path: Destination file (overwritten if it exists)
+
+        Returns:
+            The output_path that was written
+        """
+        model_data = {
+            'dimensions': self.DIMENSIONS,
+            'key_dimensions': self.KEY_DIMENSIONS,
+            # int keys are serialized as strings by json.dump
+            'cluster_names': self.CLUSTER_NAMES,
+            'scaler_mean': self.scaler.mean_.tolist(),
+            'scaler_std': self.scaler.scale_.tolist(),
+            'lda_coef': self.lda.coef_.tolist(),
+            'lda_intercept': self.lda.intercept_.tolist(),
+            'cluster_centroids': {
+                str(k): v.tolist() for k, v in self.cluster_centroids.items()
+            }
+        }
+
+        with open(output_path, 'w', encoding='utf-8') as f:
+            json.dump(model_data, f, indent=2)
+
+        print(f"Model saved to {output_path}")
+        return output_path
+
+
+def main():
+    """
+    Run the console demo.
+
+    Fits a classifier from the default data files, prints the explanation
+    for three hand-written example protocols, writes the model JSON via
+    save_model() and lists the key short-form dimensions.
+    """
+    heavy_rule = "=" * 80
+    light_rule = "-" * 80
+
+    print(heavy_rule)
+    print("BICORDER CLUSTER CLASSIFIER - DEMO")
+    print(heavy_rule)
+
+    classifier = BicorderClassifier()
+
+    # Each demo is (heading lines printed first, partial ratings to classify).
+    # On the 1-9 scale low values lean to the first pole of a dimension name
+    # and high values to the second.
+    demos = [
+        (
+            # Relational/Cultural protocol (e.g., Indigenous knowledge sharing)
+            ["\nExample 1: Community-Based Protocol"],
+            {
+                'Design_elite_vs_vernacular': 9,            # strongly vernacular
+                'Design_explicit_vs_implicit': 8,           # mostly implicit
+                'Entanglement_flocking_vs_swarming': 9,     # swarming
+                'Entanglement_obligatory_vs_voluntary': 9,  # voluntary
+                'Design_static_vs_malleable': 8,            # malleable
+                'Design_technical_vs_social': 9,            # social
+            },
+        ),
+        (
+            # Institutional protocol (e.g., Airport security)
+            ["\n\n" + heavy_rule, "Example 2: Institutional Protocol"],
+            {
+                'Design_elite_vs_vernacular': 1,            # elite
+                'Design_explicit_vs_implicit': 1,           # strongly explicit
+                'Entanglement_flocking_vs_swarming': 1,     # flocking
+                'Entanglement_obligatory_vs_voluntary': 1,  # obligatory
+                'Design_static_vs_malleable': 2,            # static
+                'Design_technical_vs_social': 2,            # technical
+                'Entanglement_sovereign_vs_subsidiary': 1,  # sovereign
+            },
+        ),
+        (
+            # Ambiguous/boundary protocol
+            ["\n\n" + heavy_rule, "Example 3: Boundary Protocol (mixed characteristics)"],
+            {
+                'Design_elite_vs_vernacular': 5,            # midpoint
+                'Design_explicit_vs_implicit': 4,           # slightly explicit
+                'Entanglement_flocking_vs_swarming': 5,     # midpoint
+                'Entanglement_obligatory_vs_voluntary': 6,  # slightly voluntary
+            },
+        ),
+    ]
+
+    for heading_lines, ratings in demos:
+        for heading in heading_lines:
+            print(heading)
+        print(light_rule)
+        print(classifier.explain_classification(ratings))
+
+    # Save model
+    print("\n\n" + heavy_rule)
+    classifier.save_model()
+    print("\nKey dimensions for short form:")
+    for key_dim in classifier.get_key_dimensions():
+        print(f"  - {key_dim}")
+
+
+# Only run the demo when executed as a script, not when imported
+if __name__ == '__main__':
+    main()