#!/usr/bin/env python3
"""
Bicorder Cluster Classifier
Provides real-time protocol classification and smart form recommendation
based on the two-cluster analysis.
Usage:
from bicorder_classifier import BicorderClassifier
classifier = BicorderClassifier()
# As user fills in dimensions
ratings = {
'Design_explicit_vs_implicit': 7,
'Design_elite_vs_vernacular': 2,
# ... etc
}
result = classifier.predict(ratings)
print(f"Cluster: {result['cluster']}")
print(f"Confidence: {result['confidence']:.1%}")
print(f"Recommend form: {result['recommended_form']}")
"""
import json
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler


class BicorderClassifier:
"""
Classifies protocols into one of two families and recommends form type.
"""
# Dimension names (in order)
DIMENSIONS = [
'Design_explicit_vs_implicit',
'Design_precise_vs_interpretive',
'Design_elite_vs_vernacular',
'Design_documenting_vs_enabling',
'Design_static_vs_malleable',
'Design_technical_vs_social',
'Design_universal_vs_particular',
'Design_durable_vs_ephemeral',
'Entanglement_macro_vs_micro',
'Entanglement_sovereign_vs_subsidiary',
'Entanglement_self-enforcing_vs_enforced',
'Entanglement_abstract_vs_embodied',
'Entanglement_obligatory_vs_voluntary',
'Entanglement_flocking_vs_swarming',
'Entanglement_defensible_vs_exposed',
'Entanglement_exclusive_vs_non-exclusive',
'Experience_sufficient_vs_insufficient',
'Experience_crystallized_vs_contested',
'Experience_trust-evading_vs_trust-inducing',
'Experience_predictable_vs_emergent',
'Experience_exclusion_vs_inclusion',
'Experience_Kafka_vs_Whitehead',
'Experience_dead_vs_alive',
]
# Cluster names
CLUSTER_NAMES = {
1: "Relational/Cultural",
2: "Institutional/Bureaucratic"
}
# Key dimensions for short form (most discriminative)
# Based on LDA analysis - top differentiating dimensions
KEY_DIMENSIONS = [
'Design_elite_vs_vernacular', # 4.602 difference
'Entanglement_flocking_vs_swarming', # 4.079 difference
'Design_static_vs_malleable', # 3.775 difference
'Entanglement_obligatory_vs_voluntary', # 3.648 difference
'Entanglement_self-enforcing_vs_enforced', # 3.628 difference
'Design_explicit_vs_implicit', # High importance
'Entanglement_sovereign_vs_subsidiary', # High importance
'Design_technical_vs_social', # High importance
]
    def __init__(self, model_path='analysis_results/data'):
        """Initialize classifier with pre-computed model data."""
        self.model_path = Path(model_path)
        self.scaler = StandardScaler()
        self.lda = None
        self.cluster_centroids = None
        # Load training data to fit scaler and LDA
        self._load_model()

    def _load_model(self):
        """Load and fit the classification model from analysis results."""
        # Load the original data and cluster assignments
        # (diagnostic_output.csv is read from the current working directory)
        df = pd.read_csv('diagnostic_output.csv')
        clusters = pd.read_csv(self.model_path / 'kmeans_clusters.csv')

        # Remove duplicates
        df = df.drop_duplicates(subset='Descriptor', keep='first')

        # Merge and drop rows with missing dimension values
        merged = df.merge(clusters, on='Descriptor')
        merged_clean = merged.dropna(subset=self.DIMENSIONS)

        # Prepare training data
        X = merged_clean[self.DIMENSIONS].values
        y = merged_clean['cluster'].values

        # Fit scaler
        self.scaler.fit(X)
        X_scaled = self.scaler.transform(X)

        # Fit LDA
        self.lda = LinearDiscriminantAnalysis(n_components=1)
        self.lda.fit(X_scaled, y)

        # Calculate cluster centroids in scaled space
        self.cluster_centroids = {}
        for cluster_id in [1, 2]:
            cluster_data = X_scaled[y == cluster_id]
            self.cluster_centroids[cluster_id] = cluster_data.mean(axis=0)
    def predict(self, ratings, return_details=True):
        """
        Predict cluster for given ratings.

        Args:
            ratings: Dict mapping dimension names to values (1-9).
                Can be partial; missing dimensions are filled with the
                scale midpoint.
            return_details: If True, returns detailed information.

        Returns:
            Dict with:
            - cluster: Predicted cluster number (1 or 2)
            - cluster_name: Human-readable cluster name
            - confidence: Confidence score (0-1)
            - completeness: Fraction of dimensions provided (0-1)
            - recommended_form: 'short' or 'long'
            - distance_to_boundary: How far from the cluster boundary
            - lda_score: Score on the discriminant axis
        """
        # Convert ratings to a full vector
        X = np.full(len(self.DIMENSIONS), np.nan)
        provided_count = 0
        for i, dim in enumerate(self.DIMENSIONS):
            if dim in ratings:
                X[i] = ratings[dim]
                provided_count += 1
        completeness = provided_count / len(self.DIMENSIONS)

        # Fill missing values with the midpoint of the 1-9 scale
        X[np.isnan(X)] = 5.0

        # Scale
        X_scaled = self.scaler.transform(X.reshape(1, -1))

        # Predict cluster
        cluster = self.lda.predict(X_scaled)[0]

        # Get LDA score (position on the discriminant axis)
        lda_score = self.lda.decision_function(X_scaled)[0]

        # Calculate confidence based on distance from the decision boundary
        # (the LDA decision boundary is at 0)
        distance_to_boundary = abs(lda_score)

        # Confidence: higher when further from the boundary.
        # Normalize based on the observed data range; 3.0 is a typical
        # strong separation.
        confidence = min(1.0, distance_to_boundary / 3.0)

        # Adjust confidence based on completeness
        adjusted_confidence = confidence * (0.5 + 0.5 * completeness)
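
        # Worked example (illustrative numbers, not from the dataset):
        # lda_score = -2.1 with 12 of 23 dimensions provided gives
        # distance_to_boundary = 2.1, raw confidence = 0.70,
        # completeness ≈ 0.52, and adjusted confidence
        # 0.70 * (0.5 + 0.5 * 0.52) ≈ 0.53.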
        # Recommend form. Use the long form when:
        # 1. Low confidence (< 0.6)
        # 2. Low completeness (< 0.5 of dimensions provided)
        # 3. Near the boundary (< 0.5 distance)
        if adjusted_confidence < 0.6 or completeness < 0.5 or distance_to_boundary < 0.5:
            recommended_form = 'long'
        else:
            recommended_form = 'short'

        if not return_details:
            return {
                'cluster': int(cluster),
                'cluster_name': self.CLUSTER_NAMES[cluster],
                'confidence': float(adjusted_confidence),
                'recommended_form': recommended_form,
            }

        # Calculate distances to each centroid
        distances = {}
        for cluster_id, centroid in self.cluster_centroids.items():
            dist = np.linalg.norm(X_scaled - centroid)
            distances[cluster_id] = float(dist)

        return {
            'cluster': int(cluster),
            'cluster_name': self.CLUSTER_NAMES[cluster],
            'confidence': float(adjusted_confidence),
            'completeness': float(completeness),
            'dimensions_provided': provided_count,
            'dimensions_total': len(self.DIMENSIONS),
            'recommended_form': recommended_form,
            'distance_to_boundary': float(distance_to_boundary),
            'lda_score': float(lda_score),
            'distances_to_centroids': distances,
            'key_dimensions_provided': sum(1 for dim in self.KEY_DIMENSIONS if dim in ratings),
            'key_dimensions_total': len(self.KEY_DIMENSIONS),
        }
    def get_key_dimensions(self):
        """Return the most important dimensions for classification."""
        return self.KEY_DIMENSIONS.copy()

    def get_short_form_dimensions(self):
        """Return recommended dimensions for the short form."""
        return self.KEY_DIMENSIONS.copy()
    def explain_classification(self, ratings):
        """
        Provide a human-readable explanation of a classification.

        Args:
            ratings: Dict mapping dimension names to values

        Returns:
            String explanation
        """
        result = self.predict(ratings, return_details=True)
        explanation = []

        explanation.append(f"Protocol Classification: {result['cluster_name']}")
        explanation.append(f"Confidence: {result['confidence']:.0%}")
        explanation.append("")

        if result['lda_score'] > 0:
            explanation.append("This protocol leans toward Institutional/Bureaucratic characteristics:")
            explanation.append(" - More likely to be formal, standardized, top-down")
            explanation.append(" - May involve state/corporate enforcement")
            explanation.append(" - Tends toward precise, documented procedures")
        else:
            explanation.append("This protocol leans toward Relational/Cultural characteristics:")
            explanation.append(" - More likely to be emergent, community-based")
            explanation.append(" - May involve voluntary participation")
            explanation.append(" - Tends toward interpretive, flexible practices")

        explanation.append("")
        explanation.append(f"Distance from boundary: {result['distance_to_boundary']:.2f}")
        if result['distance_to_boundary'] < 0.5:
            explanation.append("⚠️ This protocol is near the boundary between families.")
            explanation.append("   It may exhibit characteristics of both types.")

        explanation.append("")
        explanation.append(f"Completeness: {result['completeness']:.0%} ({result['dimensions_provided']}/{result['dimensions_total']} dimensions)")
        if result['completeness'] < 1.0:
            explanation.append("Note: Missing dimensions filled with neutral values (5)")
            explanation.append("      Confidence improves with complete data")

        explanation.append("")
        explanation.append(f"Recommended form: {result['recommended_form'].upper()}")
        if result['recommended_form'] == 'long':
            explanation.append("Long form recommended because of:")
            if result['confidence'] < 0.6:
                explanation.append(" - Low classification confidence")
            if result['completeness'] < 0.5:
                explanation.append(" - Incomplete data")
            if result['distance_to_boundary'] < 0.5:
                explanation.append(" - Ambiguous positioning between families")
        else:
            explanation.append(f"Reason: High-confidence classification with {result['completeness']:.0%} of data provided")

        return "\n".join(explanation)
    def save_model(self, output_path='bicorder_classifier_model.json'):
        """Save model parameters for use without scikit-learn."""
        model_data = {
            'dimensions': self.DIMENSIONS,
            'key_dimensions': self.KEY_DIMENSIONS,
            'cluster_names': self.CLUSTER_NAMES,
            'scaler_mean': self.scaler.mean_.tolist(),
            'scaler_std': self.scaler.scale_.tolist(),
            'lda_coef': self.lda.coef_.tolist(),
            'lda_intercept': self.lda.intercept_.tolist(),
            'cluster_centroids': {
                str(k): v.tolist() for k, v in self.cluster_centroids.items()
            },
        }
        with open(output_path, 'w') as f:
            json.dump(model_data, f, indent=2)
        print(f"Model saved to {output_path}")
        return output_path
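

def predict_from_saved_model(ratings, model_path='bicorder_classifier_model.json'):
    """Minimal sketch of scikit-learn-free scoring from the exported JSON.

    Illustrative only; this helper is not part of the original module.
    It assumes the file written by save_model() and reimplements the
    binary LDA decision rule, score = x_scaled @ coef + intercept, where
    a positive score maps to cluster 2 and a negative one to cluster 1
    (scikit-learn orders the classes as [1, 2]).
    """
    with open(model_path) as f:
        model = json.load(f)
    # Build the full feature vector, filling gaps with the scale midpoint (5).
    x = np.array([float(ratings.get(dim, 5.0)) for dim in model['dimensions']])
    # Standardize with the stored scaler parameters.
    x_scaled = (x - np.array(model['scaler_mean'])) / np.array(model['scaler_std'])
    # Linear decision score on the discriminant axis.
    score = float(x_scaled @ np.array(model['lda_coef'])[0] + model['lda_intercept'][0])
    cluster = 2 if score > 0 else 1
    # json.dump stringifies the integer keys of CLUSTER_NAMES.
    return {
        'cluster': cluster,
        'cluster_name': model['cluster_names'][str(cluster)],
        'lda_score': score,
    }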

def main():
    """Demo usage of the classifier."""
    print("=" * 80)
    print("BICORDER CLUSTER CLASSIFIER - DEMO")
    print("=" * 80)

    classifier = BicorderClassifier()

    # Example 1: Relational/Cultural protocol (e.g., Indigenous knowledge sharing)
    print("\nExample 1: Community-Based Protocol")
    print("-" * 80)
    ratings_relational = {
        'Design_elite_vs_vernacular': 9,            # Very vernacular
        'Design_explicit_vs_implicit': 8,           # More implicit
        'Entanglement_flocking_vs_swarming': 9,     # Swarming
        'Entanglement_obligatory_vs_voluntary': 9,  # Voluntary
        'Design_static_vs_malleable': 8,            # Malleable
        'Design_technical_vs_social': 9,            # Social
    }
    print(classifier.explain_classification(ratings_relational))

    # Example 2: Institutional protocol (e.g., airport security)
    print("\n\n" + "=" * 80)
    print("Example 2: Institutional Protocol")
    print("-" * 80)
    ratings_institutional = {
        'Design_elite_vs_vernacular': 1,            # Elite
        'Design_explicit_vs_implicit': 1,           # Very explicit
        'Entanglement_flocking_vs_swarming': 1,     # Flocking
        'Entanglement_obligatory_vs_voluntary': 1,  # Obligatory
        'Design_static_vs_malleable': 2,            # Static
        'Design_technical_vs_social': 2,            # Technical
        'Entanglement_sovereign_vs_subsidiary': 1,  # Sovereign
    }
    print(classifier.explain_classification(ratings_institutional))

    # Example 3: Ambiguous/boundary protocol
    print("\n\n" + "=" * 80)
    print("Example 3: Boundary Protocol (mixed characteristics)")
    print("-" * 80)
    ratings_boundary = {
        'Design_elite_vs_vernacular': 5,            # Middle
        'Design_explicit_vs_implicit': 4,           # Slightly explicit
        'Entanglement_flocking_vs_swarming': 5,     # Middle
        'Entanglement_obligatory_vs_voluntary': 6,  # Slightly voluntary
    }
    print(classifier.explain_classification(ratings_boundary))

    # Save model
    print("\n\n" + "=" * 80)
    classifier.save_model()

    print("\nKey dimensions for short form:")
    for dim in classifier.get_key_dimensions():
        print(f" - {dim}")


if __name__ == '__main__':
    main()