Derive classifier dimensions from bicorder.json automatically

Both export_model_for_js.py and bicorder_classifier.py now read DIMENSIONS and KEY_DIMENSIONS directly from bicorder.json at runtime, so the model stays in sync whenever gradient terms are renamed or added. A COLUMN_RENAMES dict handles historical CSV column name changes. The model now includes bicorder_version so the app's version check works correctly. Regenerated bicorder_model.json against bicorder.json v1.2.6 with correct dimension names, 9 key dimensions from shortform flags, and updated thresholds.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-20 15:13:54 -06:00
parent 5232e760be
commit f1ae9cac1f
3 changed files with 108 additions and 97 deletions
@@ -30,58 +30,46 @@ from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
 import json
 from pathlib import Path
 # Path to bicorder.json (relative to this script)
 _BICORDER_JSON = Path(__file__).parent.parent / 'bicorder.json'
 # Historical column renames: maps old CSV column names → current bicorder.json names.
 # Add an entry here whenever gradient terms are renamed in bicorder.json.
 _COLUMN_RENAMES = {
    'Design_elite_vs_vernacular': 'Design_institutional_vs_vernacular',
    'Entanglement_exclusive_vs_non-exclusive': 'Entanglement_monopolistic_vs_pluralistic',
    'Experience_sufficient_vs_insufficient': 'Experience_sufficient_vs_limited',
    'Experience_Kafka_vs_Whitehead': 'Experience_restraining_vs_liberating',
 }
 def _load_bicorder_dimensions(bicorder_path=_BICORDER_JSON):
    """Read DIMENSIONS and KEY_DIMENSIONS from bicorder.json.

    Args:
        bicorder_path: Path of the bicorder JSON file to read. Defaults to
            the module-level ``_BICORDER_JSON``.

    Returns:
        A ``(dimensions, key_dimensions)`` tuple of lists of strings.
        ``dimensions`` holds one name per gradient, in file order, built as
        ``"<set_name>_<term_left>_vs_<term_right>"``. ``key_dimensions`` is
        the subset of those names whose gradient has a truthy ``shortform``
        entry (a missing entry counts as False).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If ``diagnostic``, ``set_name``, ``gradients``,
            ``term_left`` or ``term_right`` is missing.
    """
    # JSON text is UTF-8; pin the encoding so parsing does not depend on the
    # platform's locale default (e.g. cp1252 on Windows).
    with open(bicorder_path, encoding='utf-8') as f:
        data = json.load(f)
    dimensions = []
    key_dimensions = []
    for category in data['diagnostic']:
        set_name = category['set_name']
        for gradient in category['gradients']:
            dim_name = f"{set_name}_{gradient['term_left']}_vs_{gradient['term_right']}"
            dimensions.append(dim_name)
            # Gradients flagged for the short form double as key dimensions.
            if gradient.get('shortform', False):
                key_dimensions.append(dim_name)
    return dimensions, key_dimensions
 class BicorderClassifier:
    """
    Classifies protocols into one of two families and recommends form type.
    """
    # Dimension names (in order)
    DIMENSIONS = [
        'Design_explicit_vs_implicit',
        'Design_precise_vs_interpretive',
        'Design_elite_vs_vernacular',
        'Design_documenting_vs_enabling',
        'Design_static_vs_malleable',
        'Design_technical_vs_social',
        'Design_universal_vs_particular',
        'Design_durable_vs_ephemeral',
        'Entanglement_macro_vs_micro',
        'Entanglement_sovereign_vs_subsidiary',
        'Entanglement_self-enforcing_vs_enforced',
        'Entanglement_abstract_vs_embodied',
        'Entanglement_obligatory_vs_voluntary',
        'Entanglement_flocking_vs_swarming',
        'Entanglement_defensible_vs_exposed',
        'Entanglement_exclusive_vs_non-exclusive',
        'Experience_sufficient_vs_insufficient',
        'Experience_crystallized_vs_contested',
        'Experience_trust-evading_vs_trust-inducing',
        'Experience_predictable_vs_emergent',
        'Experience_exclusion_vs_inclusion',
        'Experience_Kafka_vs_Whitehead',
        'Experience_dead_vs_alive',
    ]
    # Cluster names
    CLUSTER_NAMES = {
        1: "Relational/Cultural",
        2: "Institutional/Bureaucratic"
    }
    # Key dimensions for short form (most discriminative)
    # Based on LDA analysis - top differentiating dimensions
    KEY_DIMENSIONS = [
        'Design_elite_vs_vernacular',              # 4.602 difference
        'Entanglement_flocking_vs_swarming',       # 4.079 difference
        'Design_static_vs_malleable',              # 3.775 difference
        'Entanglement_obligatory_vs_voluntary',    # 3.648 difference
        'Entanglement_self-enforcing_vs_enforced', # 3.628 difference
        'Design_explicit_vs_implicit',             # High importance
        'Entanglement_sovereign_vs_subsidiary',    # High importance
        'Design_technical_vs_social',              # High importance
    ]
    def __init__(self, model_path='analysis_results/data'):
        """Initialize classifier with pre-computed model data."""
        self.model_path = Path(model_path)
@@ -89,6 +77,9 @@ class BicorderClassifier:
        self.lda = None
        self.cluster_centroids = None
        # Derive dimension lists from bicorder.json
        self.DIMENSIONS, self.KEY_DIMENSIONS = _load_bicorder_dimensions()
        # Load training data to fit scaler and LDA
        self._load_model()
@@ -98,6 +89,9 @@ class BicorderClassifier:
        df = pd.read_csv('diagnostic_output.csv')
        clusters = pd.read_csv(self.model_path / 'kmeans_clusters.csv')
        # Rename old column names to match current bicorder.json
        df = df.rename(columns=_COLUMN_RENAMES)
        # Remove duplicates
        df = df.drop_duplicates(subset='Descriptor', keep='first')
@@ -1,10 +1,11 @@
 {
  "version": "1.0",
-  "generated": "2025-12-19T11:46:23.367069",
+  "bicorder_version": "1.2.6",
+  "generated": "2026-03-20T15:08:23.160614",
  "dimensions": [
    "Design_explicit_vs_implicit",
    "Design_precise_vs_interpretive",
-    "Design_elite_vs_vernacular",
+    "Design_institutional_vs_vernacular",
    "Design_documenting_vs_enabling",
    "Design_static_vs_malleable",
    "Design_technical_vs_social",
@@ -17,24 +18,25 @@
    "Entanglement_obligatory_vs_voluntary",
    "Entanglement_flocking_vs_swarming",
    "Entanglement_defensible_vs_exposed",
-    "Entanglement_exclusive_vs_non-exclusive",
+    "Entanglement_monopolistic_vs_pluralistic",
-    "Experience_sufficient_vs_insufficient",
+    "Experience_sufficient_vs_limited",
    "Experience_crystallized_vs_contested",
    "Experience_trust-evading_vs_trust-inducing",
    "Experience_predictable_vs_emergent",
    "Experience_exclusion_vs_inclusion",
-    "Experience_Kafka_vs_Whitehead",
+    "Experience_restraining_vs_liberating",
    "Experience_dead_vs_alive"
  ],
  "key_dimensions": [
-    "Design_elite_vs_vernacular",
+    "Design_precise_vs_interpretive",
-    "Entanglement_flocking_vs_swarming",
+    "Design_institutional_vs_vernacular",
    "Design_static_vs_malleable",
-    "Entanglement_obligatory_vs_voluntary",
-    "Entanglement_self-enforcing_vs_enforced",
-    "Design_explicit_vs_implicit",
    "Entanglement_sovereign_vs_subsidiary",
-    "Design_technical_vs_social"
+    "Entanglement_self-enforcing_vs_enforced",
+    "Entanglement_obligatory_vs_voluntary",
+    "Entanglement_flocking_vs_swarming",
+    "Experience_predictable_vs_emergent",
+    "Experience_exclusion_vs_inclusion"
  ],
  "cluster_names": {
    "1": "Relational/Cultural",
@@ -229,14 +231,13 @@
    ]
  },
  "thresholds": {
-    "confidence_low": 0.5,
+    "confidence_low": 0.6,
    "completeness_low": 0.5,
-    "boundary_distance_low": 0.3
+    "boundary_distance_low": 0.5
  },
  "metadata": {
    "total_protocols": 406,
    "cluster_1_count": 216,
    "cluster_2_count": 190
-  },
+  }
-  "bicorder_version": "1.2.3"
 }
@@ -1,45 +1,67 @@
 #!/usr/bin/env python3
 """
 Export the cluster classification model to JSON for use in JavaScript.
 Reads dimension names directly from bicorder.json so the model always
 stays in sync with the current bicorder structure.
 When gradients are renamed in bicorder.json, add the old→new mapping to
 COLUMN_RENAMES so the training CSV columns are correctly aligned.
 """
 import json
 from pathlib import Path
 import pandas as pd
 import numpy as np
 from sklearn.preprocessing import StandardScaler
 from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
 import json
-# Dimension names
+# Path to bicorder.json (relative to this script)
-DIMENSIONS = [
+BICORDER_JSON = Path(__file__).parent.parent / 'bicorder.json'
-    'Design_explicit_vs_implicit',
+
-    'Design_precise_vs_interpretive',
+# Historical column renames: maps old CSV column names → current bicorder.json names.
-    'Design_elite_vs_vernacular',
+# Add an entry here whenever gradient terms are renamed in bicorder.json.
-    'Design_documenting_vs_enabling',
+COLUMN_RENAMES = {
-    'Design_static_vs_malleable',
+    'Design_elite_vs_vernacular': 'Design_institutional_vs_vernacular',
-    'Design_technical_vs_social',
+    'Entanglement_exclusive_vs_non-exclusive': 'Entanglement_monopolistic_vs_pluralistic',
-    'Design_universal_vs_particular',
+    'Experience_sufficient_vs_insufficient': 'Experience_sufficient_vs_limited',
-    'Design_durable_vs_ephemeral',
+    'Experience_Kafka_vs_Whitehead': 'Experience_restraining_vs_liberating',
-    'Entanglement_macro_vs_micro',
+}
-    'Entanglement_sovereign_vs_subsidiary',
+
-    'Entanglement_self-enforcing_vs_enforced',
+
-    'Entanglement_abstract_vs_embodied',
+def load_bicorder_dimensions(bicorder_path):
-    'Entanglement_obligatory_vs_voluntary',
+    """Read DIMENSIONS and KEY_DIMENSIONS from bicorder.json."""
-    'Entanglement_flocking_vs_swarming',
+    with open(bicorder_path) as f:
-    'Entanglement_defensible_vs_exposed',
+        data = json.load(f)
-    'Entanglement_exclusive_vs_non-exclusive',
+
-    'Experience_sufficient_vs_insufficient',
+    dimensions = []
-    'Experience_crystallized_vs_contested',
+    key_dimensions = []
-    'Experience_trust-evading_vs_trust-inducing',
+
-    'Experience_predictable_vs_emergent',
+    for category in data['diagnostic']:
-    'Experience_exclusion_vs_inclusion',
+        set_name = category['set_name']
-    'Experience_Kafka_vs_Whitehead',
+        for gradient in category['gradients']:
-    'Experience_dead_vs_alive',
+            dim_name = f"{set_name}_{gradient['term_left']}_vs_{gradient['term_right']}"
-]
+            dimensions.append(dim_name)
            if gradient.get('shortform', False):
                key_dimensions.append(dim_name)
    return dimensions, key_dimensions, data['version']
 # Derive dimensions and version from bicorder.json
 DIMENSIONS, KEY_DIMENSIONS, BICORDER_VERSION = load_bicorder_dimensions(BICORDER_JSON)
 print(f"Loaded bicorder.json v{BICORDER_VERSION}")
 print(f"Dimensions: {len(DIMENSIONS)}, key dimensions: {len(KEY_DIMENSIONS)}")
 # Load data
 df = pd.read_csv('diagnostic_output.csv')
 clusters = pd.read_csv('analysis_results/data/kmeans_clusters.csv')
 # Rename old column names to match current bicorder.json
 df = df.rename(columns=COLUMN_RENAMES)
 # Remove duplicates
 df = df.drop_duplicates(subset='Descriptor', keep='first')
@@ -47,6 +69,8 @@ df = df.drop_duplicates(subset='Descriptor', keep='first')
 merged = df.merge(clusters, on='Descriptor')
 merged_clean = merged.dropna(subset=DIMENSIONS)
 print(f"Training on {len(merged_clean)} protocols")
 # Prepare training data
 X = merged_clean[DIMENSIONS].values
 y = merged_clean['cluster'].values
@@ -59,7 +83,7 @@ X_scaled = scaler.fit_transform(X)
 lda = LinearDiscriminantAnalysis(n_components=1)
 lda.fit(X_scaled, y)
-# Calculate cluster centroids
+# Calculate cluster centroids in scaled space
 cluster_centroids = {}
 for cluster_id in [1, 2]:
    cluster_data = X_scaled[y == cluster_id]
@@ -71,21 +95,10 @@ for cluster_id in [1, 2]:
    cluster_data_original = X[y == cluster_id]
    cluster_means_original[cluster_id] = cluster_data_original.mean(axis=0).tolist()
 # Key dimensions (most discriminative)
 KEY_DIMENSIONS = [
    'Design_elite_vs_vernacular',
    'Entanglement_flocking_vs_swarming',
    'Design_static_vs_malleable',
    'Entanglement_obligatory_vs_voluntary',
    'Entanglement_self-enforcing_vs_enforced',
    'Design_explicit_vs_implicit',
    'Entanglement_sovereign_vs_subsidiary',
    'Design_technical_vs_social',
 ]
 # Build model export
 model = {
    'version': '1.0',
    'bicorder_version': BICORDER_VERSION,
    'generated': pd.Timestamp.now().isoformat(),
    'dimensions': DIMENSIONS,
    'key_dimensions': KEY_DIMENSIONS,
@@ -124,7 +137,10 @@ output_path = 'bicorder_model.json'
 with open(output_path, 'w') as f:
    json.dump(model, f, indent=2)
-print(f"Model exported to {output_path}")
+print(f"\nModel exported to {output_path}")
 print(f"Bicorder version: {BICORDER_VERSION}")
 print(f"Total dimensions: {len(DIMENSIONS)}")
-print(f"Key dimensions for short form: {len(KEY_DIMENSIONS)}")
+print(f"Key dimensions (short form):")
 for dim in KEY_DIMENSIONS:
    print(f"  - {dim}")
 print(f"Model size: {len(json.dumps(model))} bytes")