From f1ae9cac1ffe021853397046e756c73715b890a3 Mon Sep 17 00:00:00 2001
From: Nathan Schneider <n@nathanschneider.info>
Date: Fri, 20 Mar 2026 15:13:54 -0600
Subject: [PATCH] Derive classifier dimensions from bicorder.json automatically

Both export_model_for_js.py and bicorder_classifier.py now read
DIMENSIONS and KEY_DIMENSIONS directly from bicorder.json at runtime,
so the model stays in sync whenever gradient terms are renamed or
added. A COLUMN_RENAMES dict handles historical CSV column name
changes. The model now includes bicorder_version so the app's version
check works correctly.

Regenerated bicorder_model.json against bicorder.json v1.2.6 with
correct dimension names, 9 key dimensions from shortform flags, and
updated thresholds.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 analysis/bicorder_classifier.py |  74 +++++++++++------------
 analysis/bicorder_model.json    |  31 +++++-----
 analysis/export_model_for_js.py | 100 ++++++++++++++++++--------------
 3 files changed, 108 insertions(+), 97 deletions(-)

diff --git a/analysis/bicorder_classifier.py b/analysis/bicorder_classifier.py
index 3b571e1..0368081 100644
--- a/analysis/bicorder_classifier.py
+++ b/analysis/bicorder_classifier.py
@@ -30,58 +30,46 @@ from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
 import json
 from pathlib import Path
 
+# Path to bicorder.json (relative to this script)
+_BICORDER_JSON = Path(__file__).parent.parent / 'bicorder.json'
+
+# Historical column renames: maps old CSV column names → current bicorder.json names.
+# Add an entry here whenever gradient terms are renamed in bicorder.json.
+_COLUMN_RENAMES = {
+    'Design_elite_vs_vernacular': 'Design_institutional_vs_vernacular',
+    'Entanglement_exclusive_vs_non-exclusive': 'Entanglement_monopolistic_vs_pluralistic',
+    'Experience_sufficient_vs_insufficient': 'Experience_sufficient_vs_limited',
+    'Experience_Kafka_vs_Whitehead': 'Experience_restraining_vs_liberating',
+}
+
+
+def _load_bicorder_dimensions(bicorder_path=_BICORDER_JSON):
+    """Return (all dimension names, shortform-flagged names) built from bicorder.json."""
+    with open(bicorder_path) as f:
+        data = json.load(f)
+    dimensions = []
+    key_dimensions = []
+    for category in data['diagnostic']:
+        set_name = category['set_name']
+        for gradient in category['gradients']:
+            dim_name = f"{set_name}_{gradient['term_left']}_vs_{gradient['term_right']}"
+            dimensions.append(dim_name)
+            if gradient.get('shortform', False):
+                key_dimensions.append(dim_name)
+    return dimensions, key_dimensions
+
 
 class BicorderClassifier:
     """
     Classifies protocols into one of two families and recommends form type.
     """
 
-    # Dimension names (in order)
-    DIMENSIONS = [
-        'Design_explicit_vs_implicit',
-        'Design_precise_vs_interpretive',
-        'Design_elite_vs_vernacular',
-        'Design_documenting_vs_enabling',
-        'Design_static_vs_malleable',
-        'Design_technical_vs_social',
-        'Design_universal_vs_particular',
-        'Design_durable_vs_ephemeral',
-        'Entanglement_macro_vs_micro',
-        'Entanglement_sovereign_vs_subsidiary',
-        'Entanglement_self-enforcing_vs_enforced',
-        'Entanglement_abstract_vs_embodied',
-        'Entanglement_obligatory_vs_voluntary',
-        'Entanglement_flocking_vs_swarming',
-        'Entanglement_defensible_vs_exposed',
-        'Entanglement_exclusive_vs_non-exclusive',
-        'Experience_sufficient_vs_insufficient',
-        'Experience_crystallized_vs_contested',
-        'Experience_trust-evading_vs_trust-inducing',
-        'Experience_predictable_vs_emergent',
-        'Experience_exclusion_vs_inclusion',
-        'Experience_Kafka_vs_Whitehead',
-        'Experience_dead_vs_alive',
-    ]
-
     # Cluster names
     CLUSTER_NAMES = {
         1: "Relational/Cultural",
         2: "Institutional/Bureaucratic"
     }
 
-    # Key dimensions for short form (most discriminative)
-    # Based on LDA analysis - top differentiating dimensions
-    KEY_DIMENSIONS = [
-        'Design_elite_vs_vernacular',              # 4.602 difference
-        'Entanglement_flocking_vs_swarming',       # 4.079 difference
-        'Design_static_vs_malleable',              # 3.775 difference
-        'Entanglement_obligatory_vs_voluntary',    # 3.648 difference
-        'Entanglement_self-enforcing_vs_enforced', # 3.628 difference
-        'Design_explicit_vs_implicit',             # High importance
-        'Entanglement_sovereign_vs_subsidiary',    # High importance
-        'Design_technical_vs_social',              # High importance
-    ]
-
     def __init__(self, model_path='analysis_results/data'):
         """Initialize classifier with pre-computed model data."""
         self.model_path = Path(model_path)
@@ -89,6 +77,9 @@ class BicorderClassifier:
         self.lda = None
         self.cluster_centroids = None
 
+        # Derive dimension lists from bicorder.json
+        self.DIMENSIONS, self.KEY_DIMENSIONS = _load_bicorder_dimensions()
+
         # Load training data to fit scaler and LDA
         self._load_model()
 
@@ -98,6 +89,9 @@ class BicorderClassifier:
         df = pd.read_csv('diagnostic_output.csv')
         clusters = pd.read_csv(self.model_path / 'kmeans_clusters.csv')
 
+        # Rename old column names to match current bicorder.json
+        df = df.rename(columns=_COLUMN_RENAMES)
+
         # Remove duplicates
         df = df.drop_duplicates(subset='Descriptor', keep='first')
 
diff --git a/analysis/bicorder_model.json b/analysis/bicorder_model.json
index 7899e17..99a47a2 100644
--- a/analysis/bicorder_model.json
+++ b/analysis/bicorder_model.json
@@ -1,10 +1,11 @@
 {
   "version": "1.0",
-  "generated": "2025-12-19T11:46:23.367069",
+  "bicorder_version": "1.2.6",
+  "generated": "2026-03-20T15:08:23.160614",
   "dimensions": [
     "Design_explicit_vs_implicit",
     "Design_precise_vs_interpretive",
-    "Design_elite_vs_vernacular",
+    "Design_institutional_vs_vernacular",
     "Design_documenting_vs_enabling",
     "Design_static_vs_malleable",
     "Design_technical_vs_social",
@@ -17,24 +18,25 @@
     "Entanglement_obligatory_vs_voluntary",
     "Entanglement_flocking_vs_swarming",
     "Entanglement_defensible_vs_exposed",
-    "Entanglement_exclusive_vs_non-exclusive",
-    "Experience_sufficient_vs_insufficient",
+    "Entanglement_monopolistic_vs_pluralistic",
+    "Experience_sufficient_vs_limited",
     "Experience_crystallized_vs_contested",
     "Experience_trust-evading_vs_trust-inducing",
     "Experience_predictable_vs_emergent",
     "Experience_exclusion_vs_inclusion",
-    "Experience_Kafka_vs_Whitehead",
+    "Experience_restraining_vs_liberating",
     "Experience_dead_vs_alive"
   ],
   "key_dimensions": [
-    "Design_elite_vs_vernacular",
-    "Entanglement_flocking_vs_swarming",
+    "Design_precise_vs_interpretive",
+    "Design_institutional_vs_vernacular",
     "Design_static_vs_malleable",
-    "Entanglement_obligatory_vs_voluntary",
-    "Entanglement_self-enforcing_vs_enforced",
-    "Design_explicit_vs_implicit",
     "Entanglement_sovereign_vs_subsidiary",
-    "Design_technical_vs_social"
+    "Entanglement_self-enforcing_vs_enforced",
+    "Entanglement_obligatory_vs_voluntary",
+    "Entanglement_flocking_vs_swarming",
+    "Experience_predictable_vs_emergent",
+    "Experience_exclusion_vs_inclusion"
   ],
   "cluster_names": {
     "1": "Relational/Cultural",
@@ -229,14 +231,13 @@
     ]
   },
   "thresholds": {
-    "confidence_low": 0.5,
+    "confidence_low": 0.6,
     "completeness_low": 0.5,
-    "boundary_distance_low": 0.3
+    "boundary_distance_low": 0.5
   },
   "metadata": {
     "total_protocols": 406,
     "cluster_1_count": 216,
     "cluster_2_count": 190
-  },
-  "bicorder_version": "1.2.3"
+  }
 }
\ No newline at end of file
diff --git a/analysis/export_model_for_js.py b/analysis/export_model_for_js.py
index 24cc5c9..96996f8 100644
--- a/analysis/export_model_for_js.py
+++ b/analysis/export_model_for_js.py
@@ -1,45 +1,67 @@
 #!/usr/bin/env python3
 """
 Export the cluster classification model to JSON for use in JavaScript.
+
+Reads dimension names directly from bicorder.json so the model always
+stays in sync with the current bicorder structure.
+
+When gradients are renamed in bicorder.json, add the old→new mapping to
+COLUMN_RENAMES so the training CSV columns are correctly aligned.
 """
 
+import json
+from pathlib import Path
+
 import pandas as pd
 import numpy as np
 from sklearn.preprocessing import StandardScaler
 from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
-import json
 
-# Dimension names
-DIMENSIONS = [
-    'Design_explicit_vs_implicit',
-    'Design_precise_vs_interpretive',
-    'Design_elite_vs_vernacular',
-    'Design_documenting_vs_enabling',
-    'Design_static_vs_malleable',
-    'Design_technical_vs_social',
-    'Design_universal_vs_particular',
-    'Design_durable_vs_ephemeral',
-    'Entanglement_macro_vs_micro',
-    'Entanglement_sovereign_vs_subsidiary',
-    'Entanglement_self-enforcing_vs_enforced',
-    'Entanglement_abstract_vs_embodied',
-    'Entanglement_obligatory_vs_voluntary',
-    'Entanglement_flocking_vs_swarming',
-    'Entanglement_defensible_vs_exposed',
-    'Entanglement_exclusive_vs_non-exclusive',
-    'Experience_sufficient_vs_insufficient',
-    'Experience_crystallized_vs_contested',
-    'Experience_trust-evading_vs_trust-inducing',
-    'Experience_predictable_vs_emergent',
-    'Experience_exclusion_vs_inclusion',
-    'Experience_Kafka_vs_Whitehead',
-    'Experience_dead_vs_alive',
-]
+# Path to bicorder.json (relative to this script)
+BICORDER_JSON = Path(__file__).parent.parent / 'bicorder.json'
+
+# Historical column renames: maps old CSV column names → current bicorder.json names.
+# Add an entry here whenever gradient terms are renamed in bicorder.json.
+COLUMN_RENAMES = {
+    'Design_elite_vs_vernacular': 'Design_institutional_vs_vernacular',
+    'Entanglement_exclusive_vs_non-exclusive': 'Entanglement_monopolistic_vs_pluralistic',
+    'Experience_sufficient_vs_insufficient': 'Experience_sufficient_vs_limited',
+    'Experience_Kafka_vs_Whitehead': 'Experience_restraining_vs_liberating',
+}
+
+
+def load_bicorder_dimensions(bicorder_path):
+    """Read DIMENSIONS, KEY_DIMENSIONS and the 'version' value from bicorder.json."""
+    with open(bicorder_path) as f:
+        data = json.load(f)
+
+    dimensions = []
+    key_dimensions = []
+
+    for category in data['diagnostic']:
+        set_name = category['set_name']
+        for gradient in category['gradients']:
+            dim_name = f"{set_name}_{gradient['term_left']}_vs_{gradient['term_right']}"
+            dimensions.append(dim_name)
+            if gradient.get('shortform', False):
+                key_dimensions.append(dim_name)
+
+    return dimensions, key_dimensions, data['version']
+
+
+# Derive dimensions and version from bicorder.json
+DIMENSIONS, KEY_DIMENSIONS, BICORDER_VERSION = load_bicorder_dimensions(BICORDER_JSON)
+
+print(f"Loaded bicorder.json v{BICORDER_VERSION}")
+print(f"Dimensions: {len(DIMENSIONS)}, key dimensions: {len(KEY_DIMENSIONS)}")
 
 # Load data
 df = pd.read_csv('diagnostic_output.csv')
 clusters = pd.read_csv('analysis_results/data/kmeans_clusters.csv')
 
+# Rename old column names to match current bicorder.json
+df = df.rename(columns=COLUMN_RENAMES)
+
 # Remove duplicates
 df = df.drop_duplicates(subset='Descriptor', keep='first')
 
@@ -47,6 +69,8 @@ df = df.drop_duplicates(subset='Descriptor', keep='first')
 merged = df.merge(clusters, on='Descriptor')
 merged_clean = merged.dropna(subset=DIMENSIONS)
 
+print(f"Training on {len(merged_clean)} protocols")
+
 # Prepare training data
 X = merged_clean[DIMENSIONS].values
 y = merged_clean['cluster'].values
@@ -59,7 +83,7 @@ X_scaled = scaler.fit_transform(X)
 lda = LinearDiscriminantAnalysis(n_components=1)
 lda.fit(X_scaled, y)
 
-# Calculate cluster centroids
+# Calculate cluster centroids in scaled space
 cluster_centroids = {}
 for cluster_id in [1, 2]:
     cluster_data = X_scaled[y == cluster_id]
@@ -71,21 +95,10 @@ for cluster_id in [1, 2]:
     cluster_data_original = X[y == cluster_id]
     cluster_means_original[cluster_id] = cluster_data_original.mean(axis=0).tolist()
 
-# Key dimensions (most discriminative)
-KEY_DIMENSIONS = [
-    'Design_elite_vs_vernacular',
-    'Entanglement_flocking_vs_swarming',
-    'Design_static_vs_malleable',
-    'Entanglement_obligatory_vs_voluntary',
-    'Entanglement_self-enforcing_vs_enforced',
-    'Design_explicit_vs_implicit',
-    'Entanglement_sovereign_vs_subsidiary',
-    'Design_technical_vs_social',
-]
-
 # Build model export
 model = {
     'version': '1.0',
+    'bicorder_version': BICORDER_VERSION,
     'generated': pd.Timestamp.now().isoformat(),
     'dimensions': DIMENSIONS,
     'key_dimensions': KEY_DIMENSIONS,
@@ -124,7 +137,10 @@ output_path = 'bicorder_model.json'
 with open(output_path, 'w') as f:
     json.dump(model, f, indent=2)
 
-print(f"Model exported to {output_path}")
+print(f"\nModel exported to {output_path}")
+print(f"Bicorder version: {BICORDER_VERSION}")
 print(f"Total dimensions: {len(DIMENSIONS)}")
-print(f"Key dimensions for short form: {len(KEY_DIMENSIONS)}")
+print(f"Key dimensions (short form):")
+for dim in KEY_DIMENSIONS:
+    print(f"  - {dim}")
 print(f"Model size: {len(json.dumps(model))} bytes")