Reorganize directory, add manual dataset and sync tooling

- Move all scripts to scripts/, web assets to web/, analysis results into self-contained data/readings/<type>_<YYYYMMDD>/ directories
- Add data/readings/manual_20260320/ with 32 JSON readings from git.medlab.host/ntnsndr/protocol-bicorder-data
- Add scripts/json_to_csv.py to convert bicorder JSON files to CSV
- Add scripts/sync_readings.sh for one-command sync + re-analysis of any dataset backed by a .sync_source config file
- Add scripts/classify_readings.py to apply the LDA classifier to all readings and save per-reading cluster assignments
- Add --min-coverage flag to multivariate_analysis.py for sparse/shortform datasets; also applies in lda_visualization.py
- Fix lda_visualization.py NaN handling and 0-d array annotation bug
- Update README.md and WORKFLOW.md to document datasets, sync workflow, shortform handling, and new scripts

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-20 17:35:13 -06:00
parent 0c794dddae
commit 897c30406b
545 changed files with 10715 additions and 718 deletions
@@ -0,0 +1,98 @@
+/**
+ * Type definitions for Bicorder Cluster Classifier
+ */
+
+/**
+ * Shape of the trained model that BicorderClassifier consumes
+ * (the parsed contents of bicorder_model.json).
+ *
+ * All per-dimension arrays are indexed in the same order as `dimensions`.
+ */
+export interface ModelData {
+  /** Version string of the model file. */
+  version: string;
+  /** Generation stamp of the model file (format not visible here — TODO confirm). */
+  generated: string;
+  /**
+   * Version of bicorder.json the model was trained against. Optional: the
+   * classifier only compares it with the caller-supplied version when both
+   * are present, and logs a warning on mismatch.
+   */
+  bicorder_version?: string;
+  /** Dimension names; their order defines the index of every per-dimension array. */
+  dimensions: string[];
+  /** Dimensions used for the short form and its readiness assessment. */
+  key_dimensions: string[];
+  /** Human-readable name per cluster id; looked up with the predicted cluster. */
+  cluster_names: {
+    '1': string;
+    '2': string;
+  };
+  /** Description per cluster id (not read by the classifier code in this change). */
+  cluster_descriptions: {
+    '1': string;
+    '2': string;
+  };
+  /** Standardization parameters: scaled = (value - mean[i]) / scale[i]. */
+  scaler: {
+    mean: number[];
+    scale: number[];
+  };
+  /** Linear discriminant: score = sum(coefficients[i] * scaled[i]) + intercept. */
+  lda: {
+    coefficients: number[];
+    intercept: number;
+  };
+  /** Cluster centroids in scaled space; used for Euclidean distance reporting. */
+  cluster_centroids_scaled: {
+    '1': number[];
+    '2': number[];
+  };
+  /** Per-cluster means on the original scale (not read by the classifier code in this change). */
+  cluster_means_original: {
+    '1': number[];
+    '2': number[];
+  };
+  /**
+   * Cut-offs below which the long form is recommended. The classifier
+   * compares them against the adjusted confidence (0-1), the completeness
+   * fraction (0-1) and the absolute LDA score respectively.
+   */
+  thresholds: {
+    confidence_low: number;
+    completeness_low: number;
+    boundary_distance_low: number;
+  };
+  /** Training-set counts (not read by the classifier code in this change). */
+  metadata: {
+    total_protocols: number;
+    cluster_1_count: number;
+    cluster_2_count: number;
+  };
+}
+
+export interface Ratings { // Map of dimension name to rating (documented scale: 1-9); may be partial
+  [dimensionName: string]: number | null | undefined; // null/undefined entries are treated as not provided
+}
+
+export interface PredictionResult { // Basic result returned by BicorderClassifier.predict()
+  cluster: 1 | 2; // 2 when the LDA score is greater than 0, otherwise 1
+  clusterName: string; // model.cluster_names entry for the predicted cluster
+  confidence: number; // completeness-adjusted confidence as a rounded percentage (0-100)
+  completeness: number; // share of model dimensions provided, as a rounded percentage (0-100)
+  recommendedForm: 'short' | 'long'; // 'long' when confidence, completeness or boundary distance is below its model threshold
+}
+
+export interface DetailedPredictionResult extends PredictionResult { // Basic result plus the intermediate values behind it
+  ldaScore: number; // position on the discriminant axis (coefficients · scaled values + intercept)
+  distanceToBoundary: number; // absolute value of ldaScore; the decision boundary is at 0
+  dimensionsProvided: number; // count of model dimensions with a non-missing rating
+  dimensionsTotal: number; // number of dimensions in the model
+  keyDimensionsProvided: number; // count of key dimensions with a non-missing rating
+  keyDimensionsTotal: number; // number of key dimensions in the model
+  distancesToCentroids: { // Euclidean distance in scaled space to each cluster centroid
+    '1': number;
+    '2': number;
+  };
+  rawConfidence: number; // confidence before the completeness adjustment, as a rounded percentage (0-100)
+}
+
+export interface ShortFormAssessment { // Result of BicorderClassifier.assessShortFormReadiness()
+  ready: boolean; // true when at least 75% of the key dimensions are provided
+  keyDimensionsProvided: number; // count of key dimensions with a rating
+  keyDimensionsTotal: number; // number of key dimensions in the model
+  coverage: number; // keyDimensionsProvided / keyDimensionsTotal as a rounded percentage
+  missingKeyDimensions: string[]; // names of key dimensions still lacking a rating
+}
+
+export interface PredictOptions { // Options accepted by BicorderClassifier.predict()
+  detailed?: boolean; // false returns only the basic fields; the implementation's default when options are omitted is detailed output
+}
+
+/**
+ * Type declaration for the classifier implemented in bicorder-classifier.js.
+ */
+export class BicorderClassifier {
+  /**
+   * @param model - Parsed contents of bicorder_model.json.
+   * @param bicorderVersion - Optional version of bicorder.json in use. The
+   *   implementation logs a warning when both this and the model's
+   *   `bicorder_version` are set and they differ.
+   */
+  constructor(model: ModelData, bicorderVersion?: string | null);
+
+  /**
+   * Classify a (possibly partial) set of ratings.
+   *
+   * Note: when `options` is omitted the implementation defaults to detailed
+   * output, so the returned object carries the DetailedPredictionResult
+   * fields at runtime even though this overload types it as PredictionResult.
+   */
+  predict(ratings: Ratings, options?: { detailed: false }): PredictionResult;
+  predict(ratings: Ratings, options: { detailed: true }): DetailedPredictionResult;
+  predict(ratings: Ratings, options?: PredictOptions): PredictionResult | DetailedPredictionResult;
+
+  /** Build a multi-line, human-readable explanation of the classification. */
+  explainClassification(ratings: Ratings): string;
+
+  /** Return a copy of the model's key (short-form) dimension names. */
+  getKeyDimensions(): string[];
+
+  /** Report how many key dimensions are rated and whether that is enough for the short form. */
+  assessShortFormReadiness(ratings: Ratings): ShortFormAssessment;
+}
+
+export function loadClassifier(url?: string): Promise<BicorderClassifier>; // Fetches the model JSON from `url` (implementation default: './bicorder_model.json') and resolves to a classifier built from it
@@ -0,0 +1,335 @@
+/**
+ * Bicorder Cluster Classifier
+ *
+ * Real-time protocol classification for the Bicorder web app.
+ * Predicts which protocol family (Relational/Cultural vs Institutional/Bureaucratic)
+ * a protocol belongs to based on dimension ratings.
+ *
+ * Usage:
+ *   import { BicorderClassifier } from './bicorder-classifier.js';
+ *
+ *   const classifier = new BicorderClassifier(modelData);
+ *   const result = classifier.predict(ratings);
+ *   console.log(`Cluster: ${result.clusterName} (${result.confidence}% confidence)`);
+ */
+
+export class BicorderClassifier {
+  /**
+   * @param {Object} model - Model data loaded from bicorder_model.json
+   * @param {string} bicorderVersion - Version of bicorder.json being used
+   *
+   * Simple version-matching approach: The model includes a bicorder_version
+   * field. When bicorder structure changes, update the version and retrain.
+   */
+  constructor(model, bicorderVersion = null) {
+    this.model = model;
+    this.dimensions = model.dimensions;
+    this.keyDimensions = model.key_dimensions;
+    this.bicorderVersion = bicorderVersion;
+
+    // Check version compatibility
+    if (bicorderVersion && model.bicorder_version && bicorderVersion !== model.bicorder_version) {
+      console.warn(`Model version (${model.bicorder_version}) doesn't match bicorder version (${bicorderVersion}). Results may be inaccurate.`);
+    }
+  }
+
+  /**
+   * Standardize values using the fitted scaler
+   * @private
+   */
+  _standardize(values) {
+    return values.map((val, i) => {
+      if (val === null || val === undefined) return null;
+      return (val - this.model.scaler.mean[i]) / this.model.scaler.scale[i];
+    });
+  }
+
+  /**
+   * Calculate LDA score (position on discriminant axis)
+   * @private
+   */
+  _ldaScore(scaledValues) {
+    // Fill missing values with 0 (mean in scaled space)
+    const filled = scaledValues.map(v => v === null ? 0 : v);
+
+    // Calculate: coef · x + intercept
+    let score = this.model.lda.intercept;
+    for (let i = 0; i < filled.length; i++) {
+      score += this.model.lda.coefficients[i] * filled[i];
+    }
+    return score;
+  }
+
+  /**
+   * Calculate Euclidean distance
+   * @private
+   */
+  _distance(a, b) {
+    let sum = 0;
+    for (let i = 0; i < a.length; i++) {
+      const diff = a[i] - b[i];
+      sum += diff * diff;
+    }
+    return Math.sqrt(sum);
+  }
+
+  /**
+   * Predict cluster for given ratings
+   *
+   * @param {Object} ratings - Map of dimension names to values (1-9)
+   *                          Can be partial - missing dimensions handled gracefully
+   * @param {Object} options - Options
+   * @param {boolean} options.detailed - Return detailed information (default: true)
+   *
+   * @returns {Object} Prediction result with:
+   *   - cluster: Cluster number (1 or 2)
+   *   - clusterName: Human-readable name
+   *   - confidence: Confidence percentage (0-100)
+   *   - completeness: Percentage of dimensions provided (0-100)
+   *   - recommendedForm: 'short' or 'long'
+   *   - ldaScore: Position on discriminant axis
+   *   - distanceToBoundary: Distance from cluster boundary
+   */
+  predict(ratings, options = { detailed: true }) {
+    // Convert ratings object to array
+    const values = this.dimensions.map(dim => ratings[dim] ?? null);
+    const providedCount = values.filter(v => v !== null).length;
+    const completeness = providedCount / this.dimensions.length;
+
+    // Fill missing with neutral value (5 = middle of 1-9 scale)
+    const filled = values.map(v => v ?? 5);
+
+    // Standardize
+    const scaled = this._standardize(filled);
+
+    // Calculate LDA score
+    const ldaScore = this._ldaScore(scaled);
+
+    // Predict cluster (LDA boundary at 0)
+    // Positive score = cluster 2 (Institutional)
+    // Negative score = cluster 1 (Relational)
+    const cluster = ldaScore > 0 ? 2 : 1;
+    const clusterName = this.model.cluster_names[cluster];
+
+    // Calculate confidence based on distance from boundary
+    const distanceToBoundary = Math.abs(ldaScore);
+
+    // Confidence: higher when further from boundary
+    // Normalize based on typical strong separation (3.0)
+    let confidence = Math.min(1.0, distanceToBoundary / 3.0);
+
+    // Adjust for completeness
+    const adjustedConfidence = confidence * (0.5 + 0.5 * completeness);
+
+    // Recommend form
+    // Use long form when:
+    // 1. Low confidence (< 0.6)
+    // 2. Low completeness (< 50% of dimensions)
+    // 3. Near boundary (< 0.5 distance)
+    const shouldUseLongForm =
+      adjustedConfidence < this.model.thresholds.confidence_low ||
+      completeness < this.model.thresholds.completeness_low ||
+      distanceToBoundary < this.model.thresholds.boundary_distance_low;
+
+    const recommendedForm = shouldUseLongForm ? 'long' : 'short';
+
+    const basicResult = {
+      cluster,
+      clusterName,
+      confidence: Math.round(adjustedConfidence * 100),
+      completeness: Math.round(completeness * 100),
+      recommendedForm,
+    };
+
+    if (!options.detailed) {
+      return basicResult;
+    }
+
+    // Calculate distances to cluster centroids
+    const filledScaled = scaled.map(v => v ?? 0);
+    const distances = {};
+    for (const [clusterId, centroid] of Object.entries(this.model.cluster_centroids_scaled)) {
+      distances[clusterId] = this._distance(filledScaled, centroid);
+    }
+
+    // Count key dimensions provided
+    const keyDimensionsProvided = this.keyDimensions.filter(
+      dim => ratings[dim] !== null && ratings[dim] !== undefined
+    ).length;
+
+    return {
+      ...basicResult,
+      ldaScore,
+      distanceToBoundary,
+      dimensionsProvided: providedCount,
+      dimensionsTotal: this.dimensions.length,
+      keyDimensionsProvided,
+      keyDimensionsTotal: this.keyDimensions.length,
+      distancesToCentroids: distances,
+      rawConfidence: Math.round(confidence * 100),
+    };
+  }
+
+  /**
+   * Get explanation of classification
+   *
+   * @param {Object} ratings - Dimension ratings
+   * @returns {string} Human-readable explanation
+   */
+  explainClassification(ratings) {
+    const result = this.predict(ratings, { detailed: true });
+    const lines = [];
+
+    lines.push(`Protocol Classification: ${result.clusterName}`);
+    lines.push(`Confidence: ${result.confidence}%`);
+    lines.push('');
+
+    if (result.cluster === 2) {
+      lines.push('This protocol leans toward Institutional/Bureaucratic characteristics:');
+      lines.push('  • More likely to be formal, standardized, top-down');
+      lines.push('  • May involve state/corporate enforcement');
+      lines.push('  • Tends toward precise, documented procedures');
+    } else {
+      lines.push('This protocol leans toward Relational/Cultural characteristics:');
+      lines.push('  • More likely to be emergent, community-based');
+      lines.push('  • May involve voluntary participation');
+      lines.push('  • Tends toward interpretive, flexible practices');
+    }
+
+    lines.push('');
+    lines.push(`Distance from boundary: ${result.distanceToBoundary.toFixed(2)}`);
+
+    if (result.distanceToBoundary < 0.5) {
+      lines.push('⚠️  This protocol is near the boundary between families.');
+      lines.push('   It may exhibit characteristics of both types.');
+    }
+
+    lines.push('');
+    lines.push(`Completeness: ${result.completeness}% (${result.dimensionsProvided}/${result.dimensionsTotal} dimensions)`);
+
+    if (result.completeness < 100) {
+      lines.push('Note: Missing dimensions filled with neutral values (5)');
+      lines.push('      Confidence improves with complete data');
+    }
+
+    lines.push('');
+    lines.push(`Recommended form: ${result.recommendedForm.toUpperCase()}`);
+
+    if (result.recommendedForm === 'long') {
+      lines.push('Reason: Use long form for:');
+      if (result.confidence < 60) {
+        lines.push('  • Low classification confidence');
+      }
+      if (result.completeness < 50) {
+        lines.push('  • Incomplete data');
+      }
+      if (result.distanceToBoundary < 0.5) {
+        lines.push('  • Ambiguous positioning between families');
+      }
+    } else {
+      lines.push(`Reason: High confidence classification with ${result.completeness}% data`);
+    }
+
+    return lines.join('\n');
+  }
+
+  /**
+   * Get the list of key dimensions for short form
+   * @returns {Array<string>} Dimension names
+   */
+  getKeyDimensions() {
+    return [...this.keyDimensions];
+  }
+
+  /**
+   * Check if enough key dimensions are provided for reliable short-form classification
+   * @param {Object} ratings - Current ratings
+   * @returns {Object} Assessment with recommendation
+   */
+  assessShortFormReadiness(ratings) {
+    const keyProvided = this.keyDimensions.filter(
+      dim => ratings[dim] !== null && ratings[dim] !== undefined
+    );
+
+    const coverage = keyProvided.length / this.keyDimensions.length;
+    const isReady = coverage >= 0.75; // 75% of key dimensions
+
+    return {
+      ready: isReady,
+      keyDimensionsProvided: keyProvided.length,
+      keyDimensionsTotal: this.keyDimensions.length,
+      coverage: Math.round(coverage * 100),
+      missingKeyDimensions: this.keyDimensions.filter(
+        dim => !ratings[dim]
+      ),
+    };
+  }
+}
+
+/**
+ * Load model from JSON file
+ *
+ * @param {string} url - URL to bicorder_model.json
+ * @returns {Promise<BicorderClassifier>} Initialized classifier
+ */
+export async function loadClassifier(url = './bicorder_model.json') {
+  const response = await fetch(url);
+  const model = await response.json();
+  return new BicorderClassifier(model);
+}
+
+// Example usage (for testing in Node.js or browser console)
+if (typeof window === 'undefined' && typeof module !== 'undefined') { // NOTE(review): this file also uses ES `export`; when Node loads it as an ES module, `module` and `require` are not defined and this guard is false — confirm how the demo is meant to be run
+  // Node.js example
+  const fs = require('fs');
+
+  function demo() { // Reads bicorder_model.json via a relative path and prints three worked examples to the console
+    const modelData = JSON.parse(fs.readFileSync('bicorder_model.json', 'utf8'));
+    const classifier = new BicorderClassifier(modelData);
+
+    console.log('='.repeat(80));
+    console.log('BICORDER CLASSIFIER - DEMO');
+    console.log('='.repeat(80));
+
+    // Example 1: Community protocol
+    console.log('\nExample 1: Community-Based Protocol');
+    console.log('-'.repeat(80));
+    const communityRatings = { // partial ratings: dimensions not listed are treated as missing by the classifier
+      'Design_elite_vs_vernacular': 9,
+      'Design_explicit_vs_implicit': 8,
+      'Entanglement_flocking_vs_swarming': 9,
+      'Entanglement_obligatory_vs_voluntary': 9,
+      'Design_static_vs_malleable': 8,
+    };
+    console.log(classifier.explainClassification(communityRatings));
+
+    // Example 2: Institutional protocol
+    console.log('\n\n' + '='.repeat(80));
+    console.log('Example 2: Institutional Protocol');
+    console.log('-'.repeat(80));
+    const institutionalRatings = {
+      'Design_elite_vs_vernacular': 1,
+      'Design_explicit_vs_implicit': 1,
+      'Entanglement_flocking_vs_swarming': 1,
+      'Entanglement_obligatory_vs_voluntary': 1,
+    };
+    console.log(classifier.explainClassification(institutionalRatings));
+
+    // Example 3: Check short form readiness
+    console.log('\n\n' + '='.repeat(80));
+    console.log('Example 3: Short Form Readiness Assessment');
+    console.log('-'.repeat(80));
+    const partialRatings = {
+      'Design_elite_vs_vernacular': 5,
+      'Entanglement_flocking_vs_swarming': 6,
+    };
+    const assessment = classifier.assessShortFormReadiness(partialRatings);
+    console.log(`Ready for reliable classification: ${assessment.ready}`);
+    console.log(`Key dimensions coverage: ${assessment.coverage}% (${assessment.keyDimensionsProvided}/${assessment.keyDimensionsTotal})`);
+    console.log(`Missing key dimensions: ${assessment.missingKeyDimensions.length}`);
+  }
+
+  if (require.main === module) { // run the demo only when this file is the script Node was started with
+    demo();
+  }
+}
@@ -0,0 +1,41 @@
+import { BicorderClassifier } from './bicorder-classifier.js';
+import { fileURLToPath } from 'url';
+import path from 'path';
+import fs from 'fs';
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+const modelPath = path.join(__dirname, '..', 'bicorder_model.json');
+const modelData = JSON.parse(fs.readFileSync(modelPath, 'utf8'));
+const classifier = new BicorderClassifier(modelData);
+
+console.log('='.repeat(80));
+console.log('BICORDER CLASSIFIER - TEST');
+console.log('='.repeat(80));
+
+// Test 1
+console.log('\nTest 1: Institutional Protocol (e.g., Airport Security)');
+console.log('-'.repeat(80));
+const institutional = {
+  'Design_elite_vs_vernacular': 1,
+  'Design_explicit_vs_implicit': 1,
+  'Entanglement_flocking_vs_swarming': 1,
+  'Entanglement_obligatory_vs_voluntary': 1,
+};
+const result1 = classifier.predict(institutional);
+console.log(JSON.stringify(result1, null, 2));
+
+// Test 2
+console.log('\n\nTest 2: Relational Protocol (e.g., Indigenous Practices)');
+console.log('-'.repeat(80));
+const relational = {
+  'Design_elite_vs_vernacular': 9,
+  'Entanglement_flocking_vs_swarming': 9,
+  'Entanglement_obligatory_vs_voluntary': 9,
+};
+const result2 = classifier.predict(relational);
+console.log(JSON.stringify(result2, null, 2));
+
+console.log('\n\n' + '='.repeat(80));
+console.log('✓ JavaScript classifier working correctly!');
+console.log('  Model size:', Math.round(fs.statSync(modelPath).size / 1024), 'KB');
+console.log('='.repeat(80));