Reorganize directory, add manual dataset and sync tooling
- Move all scripts to scripts/, web assets to web/, analysis results into self-contained data/readings/<type>_<YYYYMMDD>/ directories - Add data/readings/manual_20260320/ with 32 JSON readings from git.medlab.host/ntnsndr/protocol-bicorder-data - Add scripts/json_to_csv.py to convert bicorder JSON files to CSV - Add scripts/sync_readings.sh for one-command sync + re-analysis of any dataset backed by a .sync_source config file - Add scripts/classify_readings.py to apply the LDA classifier to all readings and save per-reading cluster assignments - Add --min-coverage flag to multivariate_analysis.py for sparse/shortform datasets; also applies in lda_visualization.py - Fix lda_visualization.py NaN handling and 0-d array annotation bug - Update README.md and WORKFLOW.md to document datasets, sync workflow, shortform handling, and new scripts Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
BIN
analysis/scripts/__pycache__/bicorder_classifier.cpython-314.pyc
Normal file
BIN
analysis/scripts/__pycache__/bicorder_classifier.cpython-314.pyc
Normal file
Binary file not shown.
155
analysis/scripts/bicorder_analyze.py
Normal file
155
analysis/scripts/bicorder_analyze.py
Normal file
@@ -0,0 +1,155 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Protocol Bicorder Analysis Script
|
||||
|
||||
Processes a two-column CSV file (protocol descriptor and description) and adds
|
||||
columns for each diagnostic gradient from bicorder.json. Values to be filled
|
||||
by LLM commands.
|
||||
"""
|
||||
|
||||
import csv
|
||||
import json
|
||||
import sys
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def load_bicorder_config(bicorder_path):
    """Read the bicorder.json configuration file and return it as a dict."""
    with open(bicorder_path, 'r') as config_file:
        return json.load(config_file)
|
||||
|
||||
|
||||
def extract_gradients(bicorder_data):
    """Flatten every diagnostic set into a flat list of gradient records.

    Each record carries a unique 'column_name' of the form
    "<set>_<left>_vs_<right>" plus the terms and their descriptions.
    """
    gradients = []
    for dset in bicorder_data['diagnostic']:
        set_name = dset['set_name']
        gradients.extend(
            {
                'column_name': f"{set_name}_{g['term_left']}_vs_{g['term_right']}",
                'set_name': set_name,
                'term_left': g['term_left'],
                'term_left_description': g['term_left_description'],
                'term_right': g['term_right'],
                'term_right_description': g['term_right_description'],
            }
            for g in dset['gradients']
        )
    return gradients
|
||||
|
||||
|
||||
def process_csv(input_csv, output_csv, bicorder_path, analyst=None, standpoint=None):
    """
    Process the input CSV and add gradient columns.

    Copies every row of the input CSV and appends one empty column per
    diagnostic gradient (to be filled by LLM commands later), plus optional
    analyst/standpoint metadata columns.

    Args:
        input_csv: Path to input CSV file
        output_csv: Path to output CSV file
        bicorder_path: Path to bicorder.json file
        analyst: Optional analyst name
        standpoint: Optional standpoint description
    """
    # Load bicorder configuration
    bicorder_data = load_bicorder_config(bicorder_path)
    gradients = extract_gradients(bicorder_data)

    with open(input_csv, 'r', encoding='utf-8') as infile, \
            open(output_csv, 'w', newline='', encoding='utf-8') as outfile:

        reader = csv.DictReader(infile)

        # Get original fieldnames from input CSV, filter out None/empty
        # (DictReader yields a None key for rows longer than the header)
        original_fields = [f for f in reader.fieldnames if f and f.strip()]

        # Add gradient columns and metadata columns
        gradient_columns = [g['column_name'] for g in gradients]
        output_fields = list(original_fields) + gradient_columns

        # Add metadata columns if provided
        if analyst is not None:
            output_fields.append('analyst')
        if standpoint is not None:
            output_fields.append('standpoint')

        writer = csv.DictWriter(outfile, fieldnames=output_fields)
        writer.writeheader()

        # Process each protocol row
        row_count = 0
        for protocol_row in reader:
            # Start with original row data, filter out None keys
            output_row = {k: v for k, v in protocol_row.items() if k and k.strip()}

            # Initialize all gradient columns as empty (to be filled by LLM)
            for gradient in gradients:
                output_row[gradient['column_name']] = ''

            # Add metadata if provided
            if analyst is not None:
                output_row['analyst'] = analyst
            if standpoint is not None:
                output_row['standpoint'] = standpoint

            writer.writerow(output_row)
            row_count += 1

            descriptor = protocol_row.get('Descriptor', '').strip()
            print(f"Processed protocol {row_count}: {descriptor}")

    print(f"\nOutput written to: {output_csv}")
    print(f"Total protocols: {row_count}")
    print(f"Gradient columns added: {len(gradients)}")
    print(f"\nGradient columns:")
    for i, gradient in enumerate(gradients, 1):
        print(f"  {i}. {gradient['column_name']}")
|
||||
|
||||
|
||||
def main():
    """Parse CLI arguments, validate inputs, and run the CSV preparation step."""
    parser = argparse.ArgumentParser(
        description='Process protocol CSV and add bicorder diagnostic columns',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
  python3 bicorder_analyze.py protocols_edited.csv -o output.csv
  python3 bicorder_analyze.py protocols_raw.csv -o output.csv -a "Jane Doe" -s "Researcher perspective"

The script will preserve all original columns and add one column per diagnostic gradient.
Each gradient column will be empty, ready to be filled by LLM commands.
"""
    )

    parser.add_argument('input_csv', help='Input CSV file with protocol data')
    parser.add_argument('-o', '--output', required=True, help='Output CSV file')
    parser.add_argument('-b', '--bicorder',
                        default='../bicorder.json',
                        help='Path to bicorder.json (default: ../bicorder.json)')
    parser.add_argument('-a', '--analyst', help='Analyst name (adds analyst column)')
    parser.add_argument('-s', '--standpoint', help='Analyst standpoint (adds standpoint column)')

    args = parser.parse_args()

    # Validate input file exists
    if not Path(args.input_csv).exists():
        print(f"Error: Input file '{args.input_csv}' not found", file=sys.stderr)
        sys.exit(1)

    # Validate bicorder.json exists
    if not Path(args.bicorder).exists():
        print(f"Error: Bicorder config '{args.bicorder}' not found", file=sys.stderr)
        sys.exit(1)

    # Process the CSV
    process_csv(
        args.input_csv,
        args.output,
        args.bicorder,
        args.analyst,
        args.standpoint
    )


if __name__ == '__main__':
    main()
|
||||
175
analysis/scripts/bicorder_batch.py
Normal file
175
analysis/scripts/bicorder_batch.py
Normal file
@@ -0,0 +1,175 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Batch process all protocols in a CSV using the Bicorder framework.
|
||||
|
||||
This script orchestrates the entire analysis workflow:
|
||||
1. Creates output CSV with gradient columns
|
||||
2. For each protocol row:
|
||||
- Queries all 23 gradients (each in a new chat)
|
||||
- Updates CSV with results
|
||||
"""
|
||||
|
||||
import csv
|
||||
import json
|
||||
import sys
|
||||
import argparse
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def count_csv_rows(csv_path):
    """Return the number of data rows (header excluded) in a CSV file."""
    with open(csv_path, 'r', encoding='utf-8') as handle:
        return sum(1 for _ in csv.DictReader(handle))
|
||||
|
||||
|
||||
def run_bicorder_analyze(input_csv, output_csv, bicorder_path, analyst=None, standpoint=None):
    """Run bicorder_analyze.py to create output CSV.

    Invokes the sibling script as a subprocess and returns True on success,
    False if the subprocess exited non-zero (stderr is echoed in that case).
    """
    # Resolve the helper script relative to this file so the batch runner
    # works regardless of the caller's current working directory.
    cmd = ['python3', str(Path(__file__).parent / 'bicorder_analyze.py'), input_csv, '-o', output_csv, '-b', bicorder_path]

    if analyst:
        cmd.extend(['-a', analyst])
    if standpoint:
        cmd.extend(['-s', standpoint])

    print(f"Creating analysis CSV: {output_csv}")
    result = subprocess.run(cmd, capture_output=True, text=True)

    if result.returncode != 0:
        print(f"Error creating CSV: {result.stderr}", file=sys.stderr)
        return False

    print(result.stdout)
    return True
|
||||
|
||||
|
||||
def query_gradients(output_csv, row_num, bicorder_path, model=None):
    """Query all gradients for a protocol row.

    Delegates to bicorder_query.py in a subprocess. Returns True on
    success, False if the subprocess exited non-zero.
    """
    cmd = ['python3', str(Path(__file__).parent / 'bicorder_query.py'), output_csv, str(row_num),
           '-b', bicorder_path]

    if model:
        cmd.extend(['-m', model])

    print(f"Starting gradient queries...")

    # Don't capture output - let it print in real-time for progress visibility
    result = subprocess.run(cmd)

    if result.returncode != 0:
        print(f"Error querying gradients", file=sys.stderr)
        return False

    return True
|
||||
|
||||
|
||||
def process_protocol_row(input_csv, output_csv, row_num, total_rows, bicorder_path, model=None):
    """Process a single protocol row through the complete workflow.

    Prints a progress banner, then queries every gradient for the row
    (each gradient in a fresh chat). Returns True on success.
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Row {row_num}/{total_rows}")
    print(f"{banner}")

    # Query all gradients (each gradient gets a new chat)
    queried_ok = query_gradients(output_csv, row_num, bicorder_path, model)
    if not queried_ok:
        print(f"[FAILED] Could not query gradients")
        return False

    print(f"✓ Row {row_num} complete")
    return True
|
||||
|
||||
|
||||
def main():
    """Parse CLI arguments and drive the full batch-analysis workflow."""
    parser = argparse.ArgumentParser(
        description='Batch process protocols through Bicorder analysis (each gradient uses a new chat)',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
  # Process all protocols
  python3 bicorder_batch.py data/readings/synthetic_20251116/protocols_edited.csv -o data/readings/synthetic_20251116/readings.csv

  # Process specific rows
  python3 bicorder_batch.py data/readings/synthetic_20251116/protocols_edited.csv -o data/readings/synthetic_20251116/readings.csv --start 1 --end 5

  # With specific model
  python3 bicorder_batch.py data/readings/synthetic_20251116/protocols_edited.csv -o data/readings/synthetic_20251116/readings.csv -m mistral

  # With metadata
  python3 bicorder_batch.py data/readings/synthetic_20251116/protocols_edited.csv -o data/readings/synthetic_20251116/readings.csv -a "Your Name" -s "Your standpoint"
"""
    )

    parser.add_argument('input_csv', help='Input CSV file with protocol data')
    parser.add_argument('-o', '--output', required=True, help='Output CSV file')
    parser.add_argument('-b', '--bicorder',
                        default='../bicorder.json',
                        help='Path to bicorder.json (default: ../bicorder.json)')
    parser.add_argument('-m', '--model', help='LLM model to use')
    parser.add_argument('-a', '--analyst', help='Analyst name')
    parser.add_argument('-s', '--standpoint', help='Analyst standpoint')
    parser.add_argument('--start', type=int, default=1,
                        help='Start row number (1-indexed, default: 1)')
    parser.add_argument('--end', type=int,
                        help='End row number (1-indexed, default: all rows)')
    parser.add_argument('--resume', action='store_true',
                        help='Resume from existing output CSV (skip rows with values)')

    args = parser.parse_args()

    # Validate input file exists
    if not Path(args.input_csv).exists():
        print(f"Error: Input file '{args.input_csv}' not found", file=sys.stderr)
        sys.exit(1)

    # Validate bicorder.json exists
    if not Path(args.bicorder).exists():
        print(f"Error: Bicorder config '{args.bicorder}' not found", file=sys.stderr)
        sys.exit(1)

    # Count rows in input CSV
    total_rows = count_csv_rows(args.input_csv)
    end_row = args.end if args.end else total_rows

    if args.start > total_rows or end_row > total_rows:
        print(f"Error: Row range exceeds CSV size ({total_rows} rows)", file=sys.stderr)
        sys.exit(1)

    print(f"Bicorder Batch Analysis")
    print(f"Input: {args.input_csv} ({total_rows} protocols)")
    print(f"Output: {args.output}")
    print(f"Processing rows: {args.start} to {end_row}")
    if args.model:
        print(f"Model: {args.model}")
    print()

    # Step 1: Create output CSV (unless resuming)
    if not args.resume or not Path(args.output).exists():
        if not run_bicorder_analyze(args.input_csv, args.output, args.bicorder,
                                    args.analyst, args.standpoint):
            sys.exit(1)
    else:
        print(f"Resuming from existing CSV: {args.output}")

    # Step 2: Process each protocol row; failures are reported but do not
    # abort the batch.
    success_count = 0
    fail_count = 0

    for row_num in range(args.start, end_row + 1):
        if process_protocol_row(args.input_csv, args.output, row_num, end_row,
                                args.bicorder, args.model):
            success_count += 1
        else:
            fail_count += 1
            print(f"[WARNING] Row {row_num} failed, continuing...")

    # Summary
    print(f"\n{'='*60}")
    print(f"BATCH COMPLETE")
    print(f"{'='*60}")
    print(f"Successful: {success_count}")
    print(f"Failed: {fail_count}")
    print(f"Output: {args.output}")


if __name__ == '__main__':
    main()
|
||||
364
analysis/scripts/bicorder_classifier.py
Normal file
364
analysis/scripts/bicorder_classifier.py
Normal file
@@ -0,0 +1,364 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Bicorder Cluster Classifier
|
||||
|
||||
Provides real-time protocol classification and smart form recommendation
|
||||
based on the two-cluster analysis.
|
||||
|
||||
Usage:
|
||||
from bicorder_classifier import BicorderClassifier
|
||||
|
||||
classifier = BicorderClassifier()
|
||||
|
||||
# As user fills in dimensions
|
||||
ratings = {
|
||||
'Design_explicit_vs_implicit': 7,
|
||||
'Design_elite_vs_vernacular': 2,
|
||||
# ... etc
|
||||
}
|
||||
|
||||
result = classifier.predict(ratings)
|
||||
print(f"Cluster: {result['cluster']}")
|
||||
print(f"Confidence: {result['confidence']:.1%}")
|
||||
print(f"Recommend form: {result['recommended_form']}")
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
# Path to bicorder.json (relative to this script: scripts/ -> analysis/ -> repo root)
_BICORDER_JSON = Path(__file__).parent.parent.parent / 'bicorder.json'

# Historical column renames: maps old CSV column names → current bicorder.json names.
# Add an entry here whenever gradient terms are renamed in bicorder.json, so that
# older readings CSVs keep matching the dimensions derived from the current config.
_COLUMN_RENAMES = {
    'Design_elite_vs_vernacular': 'Design_institutional_vs_vernacular',
    'Entanglement_exclusive_vs_non-exclusive': 'Entanglement_monopolistic_vs_pluralistic',
    'Experience_sufficient_vs_insufficient': 'Experience_sufficient_vs_limited',
    'Experience_Kafka_vs_Whitehead': 'Experience_restraining_vs_liberating',
}
|
||||
|
||||
|
||||
def _load_bicorder_dimensions(bicorder_path=_BICORDER_JSON):
    """Read DIMENSIONS and KEY_DIMENSIONS from bicorder.json.

    Returns a pair (all_dimension_names, shortform_dimension_names); the
    shortform-flagged gradients double as the "key" dimensions.
    """
    with open(bicorder_path) as fh:
        config = json.load(fh)
    dimensions = []
    key_dimensions = []
    for category in config['diagnostic']:
        prefix = category['set_name']
        for gradient in category['gradients']:
            name = f"{prefix}_{gradient['term_left']}_vs_{gradient['term_right']}"
            dimensions.append(name)
            if gradient.get('shortform', False):
                key_dimensions.append(name)
    return dimensions, key_dimensions
|
||||
|
||||
|
||||
class BicorderClassifier:
    """
    Classifies protocols into one of two families and recommends form type.

    At construction the classifier re-fits its model from the readings CSV
    plus the offline k-means cluster assignments: a StandardScaler followed
    by a 1-component LinearDiscriminantAnalysis over all gradient dimensions.
    """

    # Cluster names (keys match the 'cluster' values in kmeans_clusters.csv)
    CLUSTER_NAMES = {
        1: "Relational/Cultural",
        2: "Institutional/Bureaucratic"
    }

    def __init__(self, diagnostic_csv='data/readings/synthetic_20251116/readings.csv',
                 model_path=None):
        """Initialize classifier with pre-computed model data.

        Args:
            diagnostic_csv: Readings CSV used as training data.
            model_path: Directory containing kmeans_clusters.csv; defaults
                to <diagnostic_csv's directory>/analysis/data.
        """
        if model_path is None:
            model_path = str(Path(diagnostic_csv).parent / 'analysis' / 'data')
        self._diagnostic_csv = diagnostic_csv
        self.model_path = Path(model_path)
        self.scaler = StandardScaler()
        self.lda = None
        self.cluster_centroids = None

        # Derive dimension lists from bicorder.json
        self.DIMENSIONS, self.KEY_DIMENSIONS = _load_bicorder_dimensions()

        # Load training data to fit scaler and LDA
        self._load_model()

    def _load_model(self):
        """Load and fit the classification model from analysis results."""
        # Load the original data and cluster assignments
        df = pd.read_csv(self._diagnostic_csv)
        clusters = pd.read_csv(self.model_path / 'kmeans_clusters.csv')

        # Rename old column names to match current bicorder.json
        df = df.rename(columns=_COLUMN_RENAMES)

        # Remove duplicates (first reading per Descriptor wins)
        df = df.drop_duplicates(subset='Descriptor', keep='first')

        # Merge and clean: keep only rows with a cluster label and no
        # missing gradient values.
        merged = df.merge(clusters, on='Descriptor')
        merged_clean = merged.dropna(subset=self.DIMENSIONS)

        # Prepare training data
        X = merged_clean[self.DIMENSIONS].values
        y = merged_clean['cluster'].values

        # Fit scaler
        self.scaler.fit(X)
        X_scaled = self.scaler.transform(X)

        # Fit LDA (single discriminant axis for the two clusters)
        self.lda = LinearDiscriminantAnalysis(n_components=1)
        self.lda.fit(X_scaled, y)

        # Calculate cluster centroids in scaled space
        self.cluster_centroids = {}
        for cluster_id in [1, 2]:
            cluster_data = X_scaled[y == cluster_id]
            self.cluster_centroids[cluster_id] = cluster_data.mean(axis=0)

    def predict(self, ratings, return_details=True):
        """
        Predict cluster for given ratings.

        Args:
            ratings: Dict mapping dimension names to values (1-9)
                Can be partial - missing dimensions are filled with median.
                NOTE(review): keys not present in self.DIMENSIONS are
                silently ignored — verify callers use current names.
            return_details: If True, returns detailed information

        Returns:
            Dict with:
            - cluster: Predicted cluster number (1 or 2)
            - cluster_name: Human-readable cluster name
            - confidence: Confidence score (0-1)
            - completeness: Fraction of dimensions provided (0-1)
            - recommended_form: 'short' or 'long'
            - distance_to_boundary: How far from cluster boundary
            - lda_score: Score on the discriminant axis
        """
        # Convert ratings to full vector
        X = np.full(len(self.DIMENSIONS), np.nan)
        provided_count = 0

        for i, dim in enumerate(self.DIMENSIONS):
            if dim in ratings:
                X[i] = ratings[dim]
                provided_count += 1

        completeness = provided_count / len(self.DIMENSIONS)

        # Fill missing values with median (5 - middle of 1-9 scale)
        X[np.isnan(X)] = 5.0

        # Scale
        X_scaled = self.scaler.transform(X.reshape(1, -1))

        # Predict cluster
        cluster = self.lda.predict(X_scaled)[0]

        # Get LDA score (position on discriminant axis)
        lda_score = self.lda.decision_function(X_scaled)[0]

        # Calculate confidence based on distance from decision boundary
        # LDA decision boundary is at 0
        distance_to_boundary = abs(lda_score)

        # Confidence: higher when further from boundary
        # Normalize based on observed data range
        confidence = min(1.0, distance_to_boundary / 3.0)  # 3.0 is typical strong separation

        # Adjust confidence based on completeness
        adjusted_confidence = confidence * (0.5 + 0.5 * completeness)

        # Recommend form
        # Use long form when:
        # 1. Low confidence (< 0.6)
        # 2. Low completeness (< 0.5 of dimensions provided)
        # 3. Near boundary (< 0.5 distance)
        if adjusted_confidence < 0.6 or completeness < 0.5 or distance_to_boundary < 0.5:
            recommended_form = 'long'
        else:
            recommended_form = 'short'

        if not return_details:
            return {
                'cluster': int(cluster),
                'cluster_name': self.CLUSTER_NAMES[cluster],
                'confidence': float(adjusted_confidence),
                'recommended_form': recommended_form
            }

        # Calculate distances to each centroid
        distances = {}
        for cluster_id, centroid in self.cluster_centroids.items():
            dist = np.linalg.norm(X_scaled - centroid)
            distances[cluster_id] = float(dist)

        return {
            'cluster': int(cluster),
            'cluster_name': self.CLUSTER_NAMES[cluster],
            'confidence': float(adjusted_confidence),
            'completeness': float(completeness),
            'dimensions_provided': provided_count,
            'dimensions_total': len(self.DIMENSIONS),
            'recommended_form': recommended_form,
            'distance_to_boundary': float(distance_to_boundary),
            'lda_score': float(lda_score),
            'distances_to_centroids': distances,
            'key_dimensions_provided': sum(1 for dim in self.KEY_DIMENSIONS if dim in ratings),
            'key_dimensions_total': len(self.KEY_DIMENSIONS),
        }

    def get_key_dimensions(self):
        """Return the most important dimensions for classification."""
        return self.KEY_DIMENSIONS.copy()

    def get_short_form_dimensions(self):
        """Return recommended dimensions for short form."""
        return self.KEY_DIMENSIONS

    def explain_classification(self, ratings):
        """
        Provide human-readable explanation of classification.

        Args:
            ratings: Dict mapping dimension names to values

        Returns:
            String explanation
        """
        result = self.predict(ratings, return_details=True)

        explanation = []
        explanation.append(f"Protocol Classification: {result['cluster_name']}")
        explanation.append(f"Confidence: {result['confidence']:.0%}")
        explanation.append(f"")

        # Positive LDA scores fall on the Institutional side of the axis.
        if result['lda_score'] > 0:
            explanation.append(f"This protocol leans toward Institutional/Bureaucratic characteristics:")
            explanation.append(f"  - More likely to be formal, standardized, top-down")
            explanation.append(f"  - May involve state/corporate enforcement")
            explanation.append(f"  - Tends toward precise, documented procedures")
        else:
            explanation.append(f"This protocol leans toward Relational/Cultural characteristics:")
            explanation.append(f"  - More likely to be emergent, community-based")
            explanation.append(f"  - May involve voluntary participation")
            explanation.append(f"  - Tends toward interpretive, flexible practices")

        explanation.append(f"")
        explanation.append(f"Distance from boundary: {result['distance_to_boundary']:.2f}")

        if result['distance_to_boundary'] < 0.5:
            explanation.append(f"⚠️  This protocol is near the boundary between families.")
            explanation.append(f"   It may exhibit characteristics of both types.")

        explanation.append(f"")
        explanation.append(f"Completeness: {result['completeness']:.0%} ({result['dimensions_provided']}/{result['dimensions_total']} dimensions)")

        if result['completeness'] < 1.0:
            explanation.append(f"Note: Missing dimensions filled with neutral values (5)")
            explanation.append(f"      Confidence improves with complete data")

        explanation.append(f"")
        explanation.append(f"Recommended form: {result['recommended_form'].upper()}")

        if result['recommended_form'] == 'long':
            explanation.append(f"Reason: Use long form for:")
            if result['confidence'] < 0.6:
                explanation.append(f"  - Low classification confidence")
            if result['completeness'] < 0.5:
                explanation.append(f"  - Incomplete data")
            if result['distance_to_boundary'] < 0.5:
                explanation.append(f"  - Ambiguous positioning between families")
        else:
            explanation.append(f"Reason: High confidence classification with {result['completeness']:.0%} data")

        return "\n".join(explanation)

    def save_model(self, output_path='bicorder_classifier_model.json'):
        """Save model parameters for use without scikit-learn."""
        model_data = {
            'dimensions': self.DIMENSIONS,
            'key_dimensions': self.KEY_DIMENSIONS,
            'cluster_names': self.CLUSTER_NAMES,
            'scaler_mean': self.scaler.mean_.tolist(),
            'scaler_std': self.scaler.scale_.tolist(),
            'lda_coef': self.lda.coef_.tolist(),
            'lda_intercept': self.lda.intercept_.tolist(),
            'cluster_centroids': {
                str(k): v.tolist() for k, v in self.cluster_centroids.items()
            }
        }

        with open(output_path, 'w') as f:
            json.dump(model_data, f, indent=2)

        print(f"Model saved to {output_path}")
        return output_path
|
||||
|
||||
|
||||
def main():
    """Demo usage of the classifier."""
    print("=" * 80)
    print("BICORDER CLUSTER CLASSIFIER - DEMO")
    print("=" * 80)

    classifier = BicorderClassifier()

    # NOTE(review): the demo ratings below use pre-rename keys (e.g.
    # 'Design_elite_vs_vernacular', an old name per _COLUMN_RENAMES);
    # predict() silently ignores keys absent from self.DIMENSIONS, so these
    # examples may under-count provided dimensions — verify against the
    # current bicorder.json.

    # Example 1: Relational/Cultural protocol (e.g., Indigenous knowledge sharing)
    print("\nExample 1: Community-Based Protocol")
    print("-" * 80)
    ratings_relational = {
        'Design_elite_vs_vernacular': 9,        # Very vernacular
        'Design_explicit_vs_implicit': 8,       # More implicit
        'Entanglement_flocking_vs_swarming': 9,     # Swarming
        'Entanglement_obligatory_vs_voluntary': 9,  # Voluntary
        'Design_static_vs_malleable': 8,        # Malleable
        'Design_technical_vs_social': 9,        # Social
    }

    print(classifier.explain_classification(ratings_relational))

    # Example 2: Institutional protocol (e.g., Airport security)
    print("\n\n" + "=" * 80)
    print("Example 2: Institutional Protocol")
    print("-" * 80)
    ratings_institutional = {
        'Design_elite_vs_vernacular': 1,        # Elite
        'Design_explicit_vs_implicit': 1,       # Very explicit
        'Entanglement_flocking_vs_swarming': 1,     # Flocking
        'Entanglement_obligatory_vs_voluntary': 1,  # Obligatory
        'Design_static_vs_malleable': 2,        # Static
        'Design_technical_vs_social': 2,        # Technical
        'Entanglement_sovereign_vs_subsidiary': 1,  # Sovereign
    }

    print(classifier.explain_classification(ratings_institutional))

    # Example 3: Ambiguous/boundary protocol
    print("\n\n" + "=" * 80)
    print("Example 3: Boundary Protocol (mixed characteristics)")
    print("-" * 80)
    ratings_boundary = {
        'Design_elite_vs_vernacular': 5,        # Middle
        'Design_explicit_vs_implicit': 4,       # Slightly implicit
        'Entanglement_flocking_vs_swarming': 5,     # Middle
        'Entanglement_obligatory_vs_voluntary': 6,  # Slightly voluntary
    }

    print(classifier.explain_classification(ratings_boundary))

    # Save model
    print("\n\n" + "=" * 80)
    classifier.save_model()
    print("\nKey dimensions for short form:")
    for dim in classifier.get_key_dimensions():
        print(f"  - {dim}")


if __name__ == '__main__':
    main()
|
||||
95
analysis/scripts/bicorder_init.py
Normal file
95
analysis/scripts/bicorder_init.py
Normal file
@@ -0,0 +1,95 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Initialize LLM conversation with bicorder framework and protocol context.
|
||||
|
||||
This script reads a protocol from the CSV and the bicorder.json framework,
|
||||
then generates a prompt to initialize the LLM conversation.
|
||||
"""
|
||||
|
||||
import csv
|
||||
import json
|
||||
import sys
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def load_bicorder_config(bicorder_path):
    """Parse the bicorder.json configuration file into a dict."""
    with open(bicorder_path, 'r') as fh:
        config = json.load(fh)
    return config
|
||||
|
||||
|
||||
def get_protocol_by_row(csv_path, row_number):
    """Get protocol data from CSV by row number (1-indexed).

    Returns a dict with 'descriptor' and 'description' (whitespace-stripped),
    or None when row_number is out of range.
    """
    with open(csv_path, 'r', encoding='utf-8') as handle:
        for index, record in enumerate(csv.DictReader(handle), start=1):
            if index != row_number:
                continue
            return {
                'descriptor': record.get('Descriptor', '').strip(),
                'description': record.get('Description', '').strip()
            }
    return None
|
||||
|
||||
|
||||
def generate_init_prompt(protocol, bicorder_data):
    """Generate the initialization prompt for the LLM.

    Ultra-minimal version for a system prompt. bicorder_data is accepted
    for interface symmetry but is not embedded in this minimal prompt.
    """
    header = f'Analyze this protocol: "{protocol["descriptor"]}"'
    body = f"Description: {protocol['description']}"
    task = ("Task: Rate this protocol on diagnostic gradients using scale 1-9 "
            "(1=left term, 5=neutral/balanced, 9=right term). Respond with just "
            "the number and brief explanation.")
    return "\n\n".join([header, body, task])
|
||||
|
||||
|
||||
def main():
    """Parse CLI arguments, validate inputs, and print the init prompt to stdout."""
    parser = argparse.ArgumentParser(
        description='Initialize LLM conversation with protocol and bicorder framework',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
  # Initialize conversation for protocol in row 1
  python3 bicorder_init.py protocols_edited.csv 1 | llm -m mistral --save init_1

  # Initialize for row 5
  python3 bicorder_init.py protocols_edited.csv 5 | llm -m mistral --save init_5
"""
    )

    parser.add_argument('input_csv', help='Input CSV file with protocol data')
    parser.add_argument('row_number', type=int, help='Row number to analyze (1-indexed)')
    parser.add_argument('-b', '--bicorder',
                        default='../bicorder.json',
                        help='Path to bicorder.json (default: ../bicorder.json)')

    args = parser.parse_args()

    # Validate input file exists
    if not Path(args.input_csv).exists():
        print(f"Error: Input file '{args.input_csv}' not found", file=sys.stderr)
        sys.exit(1)

    # Validate bicorder.json exists
    if not Path(args.bicorder).exists():
        print(f"Error: Bicorder config '{args.bicorder}' not found", file=sys.stderr)
        sys.exit(1)

    # Load protocol
    protocol = get_protocol_by_row(args.input_csv, args.row_number)
    if protocol is None:
        print(f"Error: Row {args.row_number} not found in CSV", file=sys.stderr)
        sys.exit(1)

    # Load bicorder config
    bicorder_data = load_bicorder_config(args.bicorder)

    # Generate and output prompt (stdout, so it can be piped to `llm`)
    prompt = generate_init_prompt(protocol, bicorder_data)
    print(prompt)


if __name__ == '__main__':
    main()
|
||||
230
analysis/scripts/bicorder_query.py
Normal file
230
analysis/scripts/bicorder_query.py
Normal file
@@ -0,0 +1,230 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Query LLM for individual gradient values and update CSV.
|
||||
|
||||
This script generates prompts for each gradient, queries the LLM conversation,
|
||||
and updates the CSV with the returned values.
|
||||
"""
|
||||
|
||||
import csv
|
||||
import json
|
||||
import sys
|
||||
import argparse
|
||||
import subprocess
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def load_bicorder_config(bicorder_path):
    """Parse the bicorder.json configuration file and return it as a dict."""
    with open(bicorder_path, 'r') as config_file:
        parsed = json.load(config_file)
    return parsed
|
||||
|
||||
|
||||
def extract_gradients(bicorder_data):
    """Flatten every gradient in all diagnostic sets into a list of dicts.

    Each entry carries the CSV column name ("<set>_<left>_vs_<right>"),
    the set name, and both terms with their descriptions.
    """
    flattened = []
    for diag_set in bicorder_data['diagnostic']:
        name = diag_set['set_name']
        for grad in diag_set['gradients']:
            flattened.append({
                'column_name': f"{name}_{grad['term_left']}_vs_{grad['term_right']}",
                'set_name': name,
                'term_left': grad['term_left'],
                'term_left_description': grad['term_left_description'],
                'term_right': grad['term_right'],
                'term_right_description': grad['term_right_description'],
            })
    return flattened
|
||||
|
||||
|
||||
def get_protocol_by_row(csv_path, row_number):
    """Return {'descriptor', 'description'} for the 1-indexed CSV data row.

    Returns None when the row number is out of range. Both fields are
    whitespace-stripped; missing columns yield empty strings.
    """
    with open(csv_path, 'r', encoding='utf-8') as handle:
        for index, record in enumerate(csv.DictReader(handle), start=1):
            if index != row_number:
                continue
            return {
                'descriptor': record.get('Descriptor', '').strip(),
                'description': record.get('Description', '').strip(),
            }
    return None
|
||||
|
||||
|
||||
def generate_gradient_prompt(protocol_descriptor, protocol_description, gradient):
    """Build the LLM prompt asking for a 1-9 rating on a single gradient."""
    left = gradient['term_left']
    right = gradient['term_right']
    return f"""Analyze this protocol: "{protocol_descriptor}"

Description: {protocol_description}

Evaluate the protocol on this gradient:

**{left}** (1) vs **{right}** (9)

- **{left}**: {gradient['term_left_description']}
- **{right}**: {gradient['term_right_description']}

Provide a rating from 1 to 9, where:
- 1 = strongly {left}
- 5 = neutral/balanced/not applicable
- 9 = strongly {right}

Respond with ONLY the number (1-9), optionally followed by a brief explanation.
"""
|
||||
|
||||
|
||||
def query_llm(prompt, model=None):
    """Send *prompt* to the `llm` CLI and return its stripped stdout, or None.

    A fresh `llm` invocation is made per call (no conversation state).
    Failures — a non-zero exit status, or the `llm` executable missing
    from PATH — are reported to stderr and yield None rather than
    raising, so one bad gradient query does not abort the whole run.
    (Previously a missing `llm` binary raised an uncaught
    FileNotFoundError and crashed the script.)
    """
    cmd = ['llm']
    if model:
        cmd.extend(['-m', model])

    try:
        result = subprocess.run(
            cmd,
            input=prompt,
            text=True,
            capture_output=True,
            check=True
        )
        return result.stdout.strip()
    except subprocess.CalledProcessError as e:
        print(f" Error calling llm: {e.stderr}", file=sys.stderr)
        return None
    except FileNotFoundError:
        # `llm` CLI not installed / not on PATH.
        print(" Error: `llm` command not found on PATH", file=sys.stderr)
        return None
|
||||
|
||||
|
||||
def extract_value(llm_response):
    """Extract a numeric rating (1-9) from an LLM response, or None.

    First honors the requested format — a digit at the very start of the
    response ("7", "7 - because ..."). If the model prefixed text anyway
    (e.g. "Rating: 7" or "**7**"), fall back to the first standalone
    digit 1-9; the old start-anchored regex rejected such responses.
    """
    text = llm_response.strip()

    # Requested format: response begins with the rating digit.
    match = re.search(r'^(\d)', text)
    if match:
        value = int(match.group(1))
        # A leading out-of-range digit (e.g. "0 ...") is still invalid.
        return value if 1 <= value <= 9 else None

    # Fallback: first standalone digit 1-9 anywhere in the response.
    # Word boundaries keep us from grabbing the "1" out of "10" or "2024".
    loose = re.search(r'\b([1-9])\b', text)
    return int(loose.group(1)) if loose else None
|
||||
|
||||
|
||||
def update_csv_cell(csv_path, row_number, column_name, value):
    """Set one cell of the CSV (1-indexed row) and rewrite the file.

    Returns True if the row exists and the file was rewritten, False
    otherwise. The old bounds check (`row_number <= len(rows)`) accepted
    0 and negative numbers, which silently updated rows from the END of
    the file via Python's negative indexing — now rejected. If
    column_name is not yet a column, it is appended to the header and
    other rows get an empty value for it.
    """
    # Read the whole file into memory (these CSVs are small).
    with open(csv_path, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        fieldnames = list(reader.fieldnames or [])
        rows = list(reader)

    if not (1 <= row_number <= len(rows)):
        return False

    if column_name not in fieldnames:
        # New gradient column: extend the header so DictWriter accepts it.
        fieldnames.append(column_name)

    rows[row_number - 1][column_name] = str(value)

    # Rewrite the whole file in one pass.
    with open(csv_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, restval='')
        writer.writeheader()
        writer.writerows(rows)
    return True
|
||||
|
||||
|
||||
def main():
    """CLI entry point: rate every gradient for one CSV row via the llm CLI.

    For each gradient in bicorder.json, a standalone prompt is sent to
    `llm` (a new conversation each time), the numeric rating is parsed
    from the response, and the matching CSV cell is updated.
    """
    parser = argparse.ArgumentParser(
        description='Query LLM for gradient values and update CSV',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
  # Query all gradients for protocol in row 1
  python3 bicorder_query.py analysis_output.csv 1

  # Query specific model
  python3 bicorder_query.py analysis_output.csv 1 -m mistral

  # Dry run (show prompts without calling LLM)
  python3 bicorder_query.py analysis_output.csv 1 --dry-run
"""
    )

    parser.add_argument('csv_path', help='CSV file to update')
    parser.add_argument('row_number', type=int, help='Row number to analyze (1-indexed)')
    parser.add_argument('-b', '--bicorder',
                        default='../bicorder.json',
                        help='Path to bicorder.json (default: ../bicorder.json)')
    parser.add_argument('-m', '--model', help='LLM model to use')
    parser.add_argument('--dry-run', action='store_true',
                        help='Show prompts without calling LLM or updating CSV')

    args = parser.parse_args()

    # Validate files exist before doing any work
    if not Path(args.csv_path).exists():
        print(f"Error: CSV file '{args.csv_path}' not found", file=sys.stderr)
        sys.exit(1)

    if not Path(args.bicorder).exists():
        print(f"Error: Bicorder config '{args.bicorder}' not found", file=sys.stderr)
        sys.exit(1)

    # Load protocol data (descriptor + description) for the requested row
    protocol = get_protocol_by_row(args.csv_path, args.row_number)
    if protocol is None:
        print(f"Error: Row {args.row_number} not found in CSV", file=sys.stderr)
        sys.exit(1)

    # Load bicorder config and flatten it into the gradient list
    bicorder_data = load_bicorder_config(args.bicorder)
    gradients = extract_gradients(bicorder_data)

    if args.dry_run:
        print(f"DRY RUN: Row {args.row_number}, {len(gradients)} gradients")
        print(f"Protocol: {protocol['descriptor']}\n")
    else:
        print(f"Protocol: {protocol['descriptor']}")
        print(f"Loaded {len(gradients)} gradients, starting queries...")

    # Process each gradient independently; a failure on one gradient
    # does not stop the rest (FAILED / WARNING lines are printed instead).
    for i, gradient in enumerate(gradients, 1):
        gradient_short = gradient['column_name'].replace('_', ' ')

        if not args.dry_run:
            print(f"[{i}/{len(gradients)}] Querying: {gradient_short}...", flush=True)

        # Generate prompt (including protocol context)
        prompt = generate_gradient_prompt(
            protocol['descriptor'],
            protocol['description'],
            gradient
        )

        if args.dry_run:
            print(f"[{i}/{len(gradients)}] {gradient_short}")
            print(f"Prompt:\n{prompt}\n")
            continue

        # Query LLM (new chat each time; no conversation state is shared)
        response = query_llm(prompt, args.model)

        if response is None:
            print(f"[{i}/{len(gradients)}] {gradient_short}: FAILED")
            continue

        # Extract the numeric 1-9 value from the response text
        value = extract_value(response)
        if value is None:
            print(f"[{i}/{len(gradients)}] {gradient_short}: WARNING - no valid value")
            continue

        # Update CSV
        # NOTE(review): the CSV is re-read and rewritten once per gradient;
        # acceptable for these small files, but O(rows * gradients) overall.
        if update_csv_cell(args.csv_path, args.row_number, gradient['column_name'], value):
            print(f"[{i}/{len(gradients)}] {gradient_short}: {value}")
        else:
            print(f"[{i}/{len(gradients)}] {gradient_short}: ERROR updating CSV")

    if not args.dry_run:
        print(f"\n✓ CSV updated: {args.csv_path}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point (not run on import).
    main()
|
||||
8
analysis/scripts/chunk.sh
Normal file
8
analysis/scripts/chunk.sh
Normal file
@@ -0,0 +1,8 @@
|
||||
#!/bin/bash
# Extract protocols from each input document via the llm CLI and append
# CSV rows to output.csv. Usage: ./chunk.sh file1 [file2 ...]

prompt="Return csv-formatted data (with no markdown wrapper) that consists of a list of protocols discussed or referred to in the attached text. Protocols are defined extremely broadly as 'patterns of interaction,' and may be of a nontechnical nature. Protocols should be as specific as possible, such as 'Sacrament of Reconciliation' rather than 'Religious Protocols.' The first column should provide a brief descriptor of the protocol, and the second column should describe it in a substantial paragraph of 3-5 sentences, encapsulated in quotation marks to avoid breaking on commas. Be sure to paraphrase rather than quoting directly from the source text."

for file in "$@"; do
    # Quote "$file": unquoted it was subject to word splitting and
    # globbing, so paths with spaces broke the llm invocation.
    llm -m gemma3:12b -f "$file" "$prompt" >> output.csv
    echo "Completed $file"
done
|
||||
102
analysis/scripts/classify_readings.py
Normal file
102
analysis/scripts/classify_readings.py
Normal file
@@ -0,0 +1,102 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Apply the BicorderClassifier to all readings in a CSV and save results.
|
||||
|
||||
Uses the synthetic-trained LDA model by default. Missing dimensions are
|
||||
filled with the neutral value (5), so shortform readings can still be
|
||||
classified — though with lower confidence.
|
||||
|
||||
Usage:
|
||||
python3 scripts/classify_readings.py data/readings/manual_20260320/readings.csv
|
||||
python3 scripts/classify_readings.py data/readings/manual_20260320/readings.csv \\
|
||||
--training data/readings/synthetic_20251116/readings.csv \\
|
||||
--output data/readings/manual_20260320/analysis/classifications.csv
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from bicorder_classifier import BicorderClassifier
|
||||
|
||||
|
||||
def main():
    """Classify every reading in a CSV and write per-reading cluster assignments.

    Loads a BicorderClassifier trained on the --training CSV, classifies
    each row of the input CSV, writes one row of cluster/confidence data
    per reading, and prints a cluster/confidence/shortform summary.
    """
    parser = argparse.ArgumentParser(
        description='Classify all readings in a CSV using the BicorderClassifier'
    )
    parser.add_argument('input_csv', help='Readings CSV to classify')
    parser.add_argument(
        '--training',
        default='data/readings/synthetic_20251116/readings.csv',
        help='Training CSV for classifier (default: synthetic_20251116)'
    )
    parser.add_argument(
        '--output', default=None,
        help='Output CSV path (default: <dataset>/analysis/classifications.csv)'
    )
    args = parser.parse_args()

    input_path = Path(args.input_csv)
    # Default output lives next to the input dataset, under analysis/.
    output_path = (
        Path(args.output) if args.output
        else input_path.parent / 'analysis' / 'classifications.csv'
    )
    output_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"Loading classifier (training: {args.training})...")
    classifier = BicorderClassifier(diagnostic_csv=args.training)

    df = pd.read_csv(input_path)
    print(f"Classifying {len(df)} readings from {input_path}...")

    rows = []
    for _, record in df.iterrows():
        # Build ratings dict from dimension columns only. NaN (missing)
        # dimensions are omitted; per the module docstring the classifier
        # fills them with the neutral value 5.
        ratings = {
            col: float(record[col])
            for col in classifier.DIMENSIONS
            if col in record and pd.notna(record[col])
        }

        result = classifier.predict(ratings, return_details=True)

        # One output row per reading: identifying metadata plus the
        # classifier's cluster assignment and diagnostics.
        rows.append({
            'Descriptor': record.get('Descriptor', ''),
            'analyst': record.get('analyst', ''),
            'standpoint': record.get('standpoint', ''),
            'shortform': record.get('shortform', ''),
            'cluster': result['cluster'],
            'cluster_name': result['cluster_name'],
            'confidence': round(result['confidence'], 3),
            'lda_score': round(result['lda_score'], 3),
            'distance_to_boundary': round(result['distance_to_boundary'], 3),
            'completeness': round(result['completeness'], 3),
            'dimensions_provided': result['dimensions_provided'],
            'key_dims_provided': result['key_dimensions_provided'],
            'recommended_form': result['recommended_form'],
        })

    out_df = pd.DataFrame(rows)
    out_df.to_csv(output_path, index=False)
    print(f"Classifications saved → {output_path}")

    # Summary: cluster counts with percentages
    counts = out_df['cluster_name'].value_counts()
    print(f"\nCluster summary:")
    for name, count in counts.items():
        pct = count / len(out_df) * 100
        print(f"  {name}: {count} ({pct:.0f}%)")

    low_conf = (out_df['confidence'] < 0.4).sum()
    if low_conf:
        print(f"\n  {low_conf} readings with low confidence (<0.4) — may be boundary cases")

    # String-compare the flag: the column may round-trip through CSV as text.
    shortform_count = out_df[out_df['shortform'].astype(str) == 'True'].shape[0]
    if shortform_count:
        print(f"\n  {shortform_count} shortform readings classified (missing dims filled with neutral 5)")
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point (not run on import).
    main()
|
||||
186
analysis/scripts/compare_analyses.py
Normal file
186
analysis/scripts/compare_analyses.py
Normal file
@@ -0,0 +1,186 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Compare multiple analysis CSV files to determine which most closely resembles a reference file.
|
||||
Uses Euclidean distance, correlation, and RMSE metrics.
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from scipy.stats import pearsonr
|
||||
from pathlib import Path
|
||||
|
||||
def calculate_euclidean_distance(df1, df2, numeric_cols):
    """Per-row Euclidean distance between two aligned dataframes.

    NaNs contribute zero (np.nansum), so rows with missing dimensions
    are compared only on the dimensions both frames share.
    """
    per_row = [
        np.sqrt(np.nansum((df1.loc[row, numeric_cols] - df2.loc[row, numeric_cols]) ** 2))
        for row in df1.index
    ]
    return np.array(per_row)
|
||||
|
||||
def calculate_rmse(df1, df2, numeric_cols):
    """Root-mean-squared error over all numeric cells, ignoring NaNs."""
    squared_err = (df1[numeric_cols] - df2[numeric_cols]).values ** 2
    return np.sqrt(np.nanmean(squared_err))
|
||||
|
||||
def calculate_correlation(df1, df2, numeric_cols):
    """Pearson correlation across all jointly non-NaN numeric cells.

    Returns (corr, p_value); (nan, nan) when fewer than two valid pairs exist.
    """
    flat1 = df1[numeric_cols].values.flatten()
    flat2 = df2[numeric_cols].values.flatten()

    # Keep only positions where BOTH frames have a value.
    valid = ~(np.isnan(flat1) | np.isnan(flat2))
    if valid.sum() < 2:
        return np.nan, np.nan

    corr, pvalue = pearsonr(flat1[valid], flat2[valid])
    return corr, pvalue
|
||||
|
||||
def compare_analyses(reference_file, comparison_files):
    """Compare multiple analysis CSVs against a reference CSV.

    For each comparison file: align rows to the reference by Descriptor,
    compute Euclidean distance / RMSE / Pearson correlation over the
    rating-dimension columns, print the metrics, and finally print three
    rankings plus the 10 most-divergent protocols for the best match.
    Returns a dict of per-file results keyed by comparison filename.
    """

    # Read reference file
    print(f"Reading reference file: {reference_file}")
    ref_df = pd.read_csv(reference_file, quotechar='"', escapechar='\\', engine='python')
    # Get numeric columns (all the rating dimensions)
    numeric_cols = [col for col in ref_df.columns if
                    col.startswith(('Design_', 'Entanglement_', 'Experience_'))]

    # Convert numeric columns to numeric type, coercing errors to NaN
    for col in numeric_cols:
        ref_df[col] = pd.to_numeric(ref_df[col], errors='coerce')

    print(f"\nFound {len(numeric_cols)} numeric dimensions to compare")
    print(f"Comparing {len(ref_df)} protocols\n")
    print("="*80)

    results = {}

    for comp_file in comparison_files:
        print(f"\nComparing: {Path(comp_file).name}")
        print("-"*80)

        # Read comparison file
        comp_df = pd.read_csv(comp_file, quotechar='"', escapechar='\\', engine='python')

        # Convert numeric columns to numeric type, coercing errors to NaN
        for col in numeric_cols:
            comp_df[col] = pd.to_numeric(comp_df[col], errors='coerce')

        # Ensure same protocols in same order (match by Descriptor).
        # Left-merge keeps only ref_df's protocols, in ref_df's order;
        # protocols missing from comp_df become all-NaN rows and are
        # therefore ignored by the NaN-aware metrics below.
        if 'Descriptor' in ref_df.columns and 'Descriptor' in comp_df.columns:
            # Use merge to ensure exact matching - only keep protocols in ref_df
            comp_df = pd.merge(
                ref_df[['Descriptor']],
                comp_df,
                on='Descriptor',
                how='left'
            )

        # Calculate Euclidean distances using reset indices to ensure alignment
        ref_temp = ref_df.reset_index(drop=True)
        comp_temp = comp_df.reset_index(drop=True)
        euclidean_distances = calculate_euclidean_distance(ref_temp, comp_temp, numeric_cols)
        total_euclidean = np.sum(euclidean_distances)
        avg_euclidean = np.mean(euclidean_distances)

        # Calculate RMSE
        rmse = calculate_rmse(ref_temp, comp_temp, numeric_cols)

        # Calculate correlation
        correlation, p_value = calculate_correlation(ref_temp, comp_temp, numeric_cols)

        # Store results
        results[Path(comp_file).name] = {
            'total_euclidean': total_euclidean,
            'avg_euclidean': avg_euclidean,
            'rmse': rmse,
            'correlation': correlation,
            'p_value': p_value,
            'per_protocol_distances': euclidean_distances,
            'protocols': ref_df['Descriptor'].values if 'Descriptor' in ref_df.columns else None
        }

        # Print results
        print(f"  Total Euclidean Distance: {total_euclidean:.2f}")
        print(f"  Average Euclidean Distance: {avg_euclidean:.2f}")
        print(f"  RMSE: {rmse:.2f}")
        print(f"  Pearson Correlation: {correlation:.4f} (p={p_value:.2e})")

    # Summary comparison
    print("\n" + "="*80)
    print("SUMMARY RANKING (lower distance = more similar)")
    print("="*80)

    # Sort by average Euclidean distance
    sorted_by_euclidean = sorted(results.items(), key=lambda x: x[1]['avg_euclidean'])

    print("\nBy Average Euclidean Distance:")
    for i, (name, data) in enumerate(sorted_by_euclidean, 1):
        print(f"  {i}. {name:30s} - Avg Distance: {data['avg_euclidean']:.2f}")

    # Sort by correlation (higher is better)
    sorted_by_corr = sorted(results.items(), key=lambda x: x[1]['correlation'], reverse=True)

    print("\nBy Correlation (higher = more similar):")
    for i, (name, data) in enumerate(sorted_by_corr, 1):
        print(f"  {i}. {name:30s} - Correlation: {data['correlation']:.4f}")

    # Sort by RMSE
    sorted_by_rmse = sorted(results.items(), key=lambda x: x[1]['rmse'])

    print("\nBy RMSE (lower = more similar):")
    for i, (name, data) in enumerate(sorted_by_rmse, 1):
        print(f"  {i}. {name:30s} - RMSE: {data['rmse']:.2f}")

    # Show protocols with largest differences for the best match
    # ("best" = smallest average Euclidean distance)
    print("\n" + "="*80)
    best_match_name, best_match_data = sorted_by_euclidean[0]
    print(f"Top 10 protocols with largest differences from {best_match_name}:")
    print("="*80)

    if best_match_data['protocols'] is not None:
        distances = best_match_data['per_protocol_distances']
        protocols = best_match_data['protocols']
        # argsort is ascending: take the last 10, reversed for descending order
        top_diff_indices = np.argsort(distances)[-10:][::-1]

        for idx in top_diff_indices:
            print(f"  {protocols[idx]:50s} - Distance: {distances[idx]:.2f}")

    return results
|
||||
|
||||
if __name__ == "__main__":
    # Define file paths
    reference_file = "data/readings/synthetic_20251116/readings_manual.csv"
    comparison_files = [
        "data/readings/synthetic_20251116/readings_gemma3-12b.csv",
        "data/readings/synthetic_20251116/readings_gpt-oss.csv",
        "data/readings/synthetic_20251116/readings_mistral.csv"
    ]

    # The reference file is mandatory
    if not Path(reference_file).exists():
        print(f"Error: Reference file '{reference_file}' not found")
        exit(1)

    # Keep only the comparison files that exist.
    # (The old loop called list.remove() while iterating the same list,
    # which skips the element after each removal — two consecutive
    # missing files left the second one in and crashed later.)
    existing_files = []
    for file in comparison_files:
        if Path(file).exists():
            existing_files.append(file)
        else:
            print(f"Warning: Comparison file '{file}' not found, skipping...")
    comparison_files = existing_files

    if not comparison_files:
        print("Error: No comparison files found")
        exit(1)

    # Run comparison
    results = compare_analyses(reference_file, comparison_files)

    print("\n" + "="*80)
    print("Analysis complete!")
    print("="*80)
|
||||
189
analysis/scripts/convert_csv_to_json.py
Normal file
189
analysis/scripts/convert_csv_to_json.py
Normal file
@@ -0,0 +1,189 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Convert diagnostic_output.csv to individual JSON files following the bicorder.json spec.
|
||||
Handles mapping between old CSV column names and current spec terminology.
|
||||
"""
|
||||
|
||||
import csv
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Any, Optional
|
||||
import statistics
|
||||
|
||||
|
||||
# Mapping from CSV columns to spec terms
# Format: (csv_column_suffix, set_name, term_left, term_right)
# term_left/term_right use the CURRENT bicorder.json terminology; the
# csv_column_suffix preserves the historical column name found in older
# CSVs (see the "Changed:" notes where the two diverge).
GRADIENT_MAPPINGS = [
    # Design set
    ("explicit_vs_implicit", "Design", "explicit", "implicit"),
    ("precise_vs_interpretive", "Design", "precise", "interpretive"),
    ("elite_vs_vernacular", "Design", "institutional", "vernacular"),  # Changed: elite → institutional
    ("documenting_vs_enabling", "Design", "documenting", "enabling"),
    ("static_vs_malleable", "Design", "static", "malleable"),
    ("technical_vs_social", "Design", "technical", "social"),
    ("universal_vs_particular", "Design", "universal", "particular"),
    ("durable_vs_ephemeral", "Design", "durable", "ephemeral"),

    # Entanglement set
    ("macro_vs_micro", "Entanglement", "macro", "micro"),
    ("sovereign_vs_subsidiary", "Entanglement", "sovereign", "subsidiary"),
    ("self-enforcing_vs_enforced", "Entanglement", "self-enforcing", "enforced"),
    ("abstract_vs_embodied", "Entanglement", "abstract", "embodied"),
    ("obligatory_vs_voluntary", "Entanglement", "obligatory", "voluntary"),
    ("flocking_vs_swarming", "Entanglement", "flocking", "swarming"),
    ("defensible_vs_exposed", "Entanglement", "defensible", "exposed"),
    ("exclusive_vs_non-exclusive", "Entanglement", "monopolistic", "pluralistic"),  # Changed: exclusive → monopolistic

    # Experience set
    ("sufficient_vs_insufficient", "Experience", "sufficient", "limited"),  # Changed: insufficient → limited
    ("crystallized_vs_contested", "Experience", "crystallized", "contested"),
    ("trust-evading_vs_trust-inducing", "Experience", "trust-evading", "trust-inducing"),
    ("predictable_vs_emergent", "Experience", "predictable", "emergent"),
    ("exclusion_vs_inclusion", "Experience", "exclusion", "inclusion"),
    ("Kafka_vs_Whitehead", "Experience", "restraining", "liberating"),  # Changed: Kafka_vs_Whitehead → restraining_vs_liberating
    ("dead_vs_alive", "Experience", "dead", "alive"),
]
|
||||
|
||||
|
||||
def load_spec_template(spec_path: str) -> Dict[str, Any]:
    """Read bicorder.json and return the parsed spec used as a per-protocol template."""
    with open(spec_path, 'r') as spec_file:
        template = json.load(spec_file)
    return template
|
||||
|
||||
|
||||
def calculate_hardness(gradient_values: List[Optional[int]]) -> Optional[int]:
    """Hardness = mean of the non-None gradient values, rounded to an int.

    Returns None when no valid values are present.
    """
    present = [value for value in gradient_values if value is not None]
    if not present:
        return None
    return round(statistics.mean(present))
|
||||
|
||||
|
||||
def calculate_polarization(gradient_values: List[Optional[int]]) -> Optional[int]:
    """Map the average distance from the neutral value 5 onto a 9..1 scale.

    All values at the extremes (1 or 9) -> 1 (maximally polarized);
    all values at 5 -> 9 (fully centered). Returns None when no valid
    values are present.
    """
    present = [value for value in gradient_values if value is not None]
    if not present:
        return None

    # Mean absolute offset from the neutral midpoint (5); max possible is 4.
    mean_offset = statistics.mean(abs(value - 5) for value in present)

    # Linear map: offset 4 -> 1 (polarized), offset 0 -> 9 (centrist).
    return round(9 - (mean_offset / 4) * 8)
|
||||
|
||||
|
||||
def create_json_from_row(row: Dict[str, str], template: Dict[str, Any]) -> Dict[str, Any]:
    """Convert a CSV row to a JSON object following the spec.

    The template (parsed bicorder.json) is deep-copied, metadata and
    per-gradient values are filled in from the row, and the automated
    analysis fields (hardness, polarization) are computed from the
    values actually present.
    """
    result = json.loads(json.dumps(template))  # Deep copy via JSON round-trip

    # Update metadata
    result["metadata"]["protocol"] = row["Descriptor"]
    result["metadata"]["description"] = row["Description"]
    result["metadata"]["analyst"] = row["analyst"]
    result["metadata"]["standpoint"] = row["standpoint"]
    result["metadata"]["timestamp"] = None  # Not in CSV

    # Collect gradient values for analysis calculations
    gradient_values = []

    # Map CSV values to gradient objects
    for csv_suffix, set_name, term_left, term_right in GRADIENT_MAPPINGS:
        # CSV columns use the HISTORICAL suffix, prefixed with the set name.
        csv_column = f"{set_name}_{csv_suffix}"

        # Get the value from CSV (may be empty string)
        csv_value = row.get(csv_column, "").strip()
        value = int(csv_value) if csv_value else None

        if value is not None:
            gradient_values.append(value)

        # Find the corresponding gradient in the template by matching the
        # CURRENT term names (term_left/term_right from GRADIENT_MAPPINGS).
        for diagnostic_set in result["diagnostic"]:
            if diagnostic_set["set_name"] == set_name:
                for gradient in diagnostic_set["gradients"]:
                    if gradient["term_left"] == term_left and gradient["term_right"] == term_right:
                        gradient["value"] = value
                        break

    # Calculate automated analysis fields
    # NOTE(review): analysis entries are addressed by position — this
    # assumes bicorder.json keeps the order
    # [hardness, polarized, bureaucratic, usefulness]; confirm on spec changes.
    result["analysis"][0]["value"] = calculate_hardness(gradient_values)  # hardness
    result["analysis"][1]["value"] = calculate_polarization(gradient_values)  # polarized
    # analysis[2] is bureaucratic (LDA-based) - leave as null
    # analysis[3] is usefulness - leave as null (not automated)

    return result
|
||||
|
||||
|
||||
def main():
    """Convert a readings CSV into one JSON file per protocol.

    Reads the bicorder.json spec as a template, fills in each CSV row's
    gradient values plus the derived hardness/polarization fields, and
    writes <NNN>_<sanitized protocol name>.json files to the output
    directory.
    """
    import argparse
    parser = argparse.ArgumentParser(
        description='Convert diagnostic readings CSV to individual JSON files',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
  python3 scripts/convert_csv_to_json.py data/readings/synthetic_20251116/readings.csv
  python3 scripts/convert_csv_to_json.py data/readings/manual_20260101/readings.csv --output-dir data/readings/manual_20260101/json
"""
    )
    parser.add_argument('input_csv', help='Diagnostic readings CSV (e.g. data/readings/synthetic_20251116/readings.csv)')
    parser.add_argument('--output-dir', default=None,
                        help='Output directory for JSON files (default: <dataset_dir>/json)')
    parser.add_argument('--bicorder', default='../bicorder.json',
                        help='Path to bicorder.json (default: ../bicorder.json)')
    args = parser.parse_args()

    csv_path = args.input_csv
    spec_path = args.bicorder
    output_dir = args.output_dir if args.output_dir else str(Path(args.input_csv).parent / 'json')

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Load template
    template = load_spec_template(spec_path)

    # Process CSV
    with open(csv_path, 'r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)

        count = 0
        for i, row in enumerate(reader, start=1):
            # Create JSON object
            json_obj = create_json_from_row(row, template)

            # Generate a filesystem-safe filename from the protocol name.
            # BUG FIX: the sanitized name was computed and then immediately
            # discarded by an overwrite; it is now actually used, giving
            # files like "001_Sacrament of Reconciliation.json".
            protocol_name = row["Descriptor"]
            safe_name = protocol_name.replace("/", "_").replace("\\", "_")
            filename = f"{i:03d}_{safe_name}.json"

            # Write to file
            output_path = os.path.join(output_dir, filename)
            with open(output_path, 'w', encoding='utf-8') as jsonfile:
                json.dump(json_obj, jsonfile, indent=2)

            count += 1
            if count % 50 == 0:
                print(f"Processed {count} protocols...")

    print(f"\nConversion complete! Created {count} JSON files in {output_dir}/")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point (not run on import).
    main()
|
||||
172
analysis/scripts/export_model_for_js.py
Normal file
172
analysis/scripts/export_model_for_js.py
Normal file
@@ -0,0 +1,172 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Export the cluster classification model to JSON for use in JavaScript.
|
||||
|
||||
Reads dimension names directly from bicorder.json so the model always
|
||||
stays in sync with the current bicorder structure.
|
||||
|
||||
When gradients are renamed in bicorder.json, add the old→new mapping to
|
||||
COLUMN_RENAMES so the training CSV columns are correctly aligned.
|
||||
|
||||
Usage:
|
||||
python3 scripts/export_model_for_js.py data/readings/synthetic_20251116/readings.csv
|
||||
python3 scripts/export_model_for_js.py data/readings/manual_20260101/readings.csv --output bicorder_model.json
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
|
||||
|
||||
# Path to bicorder.json (relative to this script)
|
||||
BICORDER_JSON = Path(__file__).parent.parent.parent / 'bicorder.json'
|
||||
|
||||
# Historical column renames: maps old CSV column names → current bicorder.json names.
# Add an entry here whenever gradient terms are renamed in bicorder.json.
# Keys are column names as they appear in older training CSVs; values follow
# the "<set>_<left>_vs_<right>" convention derived from the current spec.
COLUMN_RENAMES: Dict[str, str] = {
    'Design_elite_vs_vernacular': 'Design_institutional_vs_vernacular',
    'Entanglement_exclusive_vs_non-exclusive': 'Entanglement_monopolistic_vs_pluralistic',
    'Experience_sufficient_vs_insufficient': 'Experience_sufficient_vs_limited',
    'Experience_Kafka_vs_Whitehead': 'Experience_restraining_vs_liberating',
}
|
||||
|
||||
|
||||
def load_bicorder_dimensions(bicorder_path):
    """Read (dimensions, key_dimensions, version) from bicorder.json.

    A dimension name is "<set>_<left>_vs_<right>"; key dimensions are the
    gradients flagged with "shortform": true in the spec.
    """
    with open(bicorder_path) as spec_file:
        spec = json.load(spec_file)

    all_dims = []
    key_dims = []
    for diag_set in spec['diagnostic']:
        prefix = diag_set['set_name']
        for grad in diag_set['gradients']:
            dim = f"{prefix}_{grad['term_left']}_vs_{grad['term_right']}"
            all_dims.append(dim)
            if grad.get('shortform', False):
                key_dims.append(dim)

    return all_dims, key_dims, spec['version']
|
||||
|
||||
|
||||
def main():
    """CLI entry point: train the LDA cluster model and export it as JSON.

    Pipeline: load the readings CSV plus the k-means cluster assignments from
    the dataset's analysis/ directory, align historical column names via
    COLUMN_RENAMES, fit a StandardScaler and a 1-component LDA on complete
    rows, then serialize everything a JavaScript client needs (scaler params,
    LDA coefficients, centroids, thresholds) into a single JSON file.
    """
    parser = argparse.ArgumentParser(
        description='Export cluster classification model to JSON for JavaScript',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
  python3 scripts/export_model_for_js.py data/readings/synthetic_20251116/readings.csv
  python3 scripts/export_model_for_js.py data/readings/manual_20260101/readings.csv --output bicorder_model.json
"""
    )
    parser.add_argument('input_csv', help='Diagnostic readings CSV (e.g. data/readings/synthetic_20251116/readings.csv)')
    parser.add_argument('--output', default='bicorder_model.json',
                        help='Output model JSON path (default: bicorder_model.json)')
    args = parser.parse_args()

    # Datasets are self-contained: readings.csv sits next to its analysis/ dir
    dataset_dir = Path(args.input_csv).parent
    analysis_dir = dataset_dir / 'analysis'

    # Derive dimensions and version from bicorder.json
    DIMENSIONS, KEY_DIMENSIONS, BICORDER_VERSION = load_bicorder_dimensions(BICORDER_JSON)

    print(f"Loaded bicorder.json v{BICORDER_VERSION}")
    print(f"Dimensions: {len(DIMENSIONS)}, key dimensions: {len(KEY_DIMENSIONS)}")

    # Load data: readings plus previously computed k-means cluster assignments
    df = pd.read_csv(args.input_csv)
    clusters = pd.read_csv(analysis_dir / 'data' / 'kmeans_clusters.csv')

    # Rename old column names to match current bicorder.json
    df = df.rename(columns=COLUMN_RENAMES)

    # Remove duplicates (keep the first reading of each protocol)
    df = df.drop_duplicates(subset='Descriptor', keep='first')

    # Merge and clean: only rows that have a cluster assignment and no missing
    # dimension values can be used for supervised training.
    merged = df.merge(clusters, on='Descriptor')
    merged_clean = merged.dropna(subset=DIMENSIONS)

    print(f"Training on {len(merged_clean)} protocols")

    # Prepare training data
    X = merged_clean[DIMENSIONS].values
    y = merged_clean['cluster'].values

    # Fit scaler (JS side must apply the same mean/scale before scoring)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Fit LDA — one discriminant axis is the maximum for two classes
    lda = LinearDiscriminantAnalysis(n_components=1)
    lda.fit(X_scaled, y)

    # Calculate cluster centroids in scaled space
    cluster_centroids = {}
    for cluster_id in [1, 2]:
        cluster_data = X_scaled[y == cluster_id]
        cluster_centroids[cluster_id] = cluster_data.mean(axis=0).tolist()

    # Calculate cluster means in original space (for reference)
    cluster_means_original = {}
    for cluster_id in [1, 2]:
        cluster_data_original = X[y == cluster_id]
        cluster_means_original[cluster_id] = cluster_data_original.mean(axis=0).tolist()

    # Build model export: everything the JS classifier needs in one document
    model = {
        'version': '1.0',
        'bicorder_version': BICORDER_VERSION,
        'generated': pd.Timestamp.now().isoformat(),
        'dimensions': DIMENSIONS,
        'key_dimensions': KEY_DIMENSIONS,
        'cluster_names': {
            '1': 'Relational/Cultural',
            '2': 'Institutional/Bureaucratic'
        },
        'cluster_descriptions': {
            '1': 'Community-based, emergent, voluntary, cultural protocols',
            '2': 'Formal, institutional, top-down, bureaucratic protocols'
        },
        'scaler': {
            'mean': scaler.mean_.tolist(),
            'scale': scaler.scale_.tolist()
        },
        'lda': {
            'coefficients': lda.coef_[0].tolist(),
            'intercept': lda.intercept_[0]
        },
        'cluster_centroids_scaled': cluster_centroids,
        'cluster_means_original': cluster_means_original,
        # Presumably consumed by the JS UI to flag low-confidence readings —
        # TODO confirm against the web client.
        'thresholds': {
            'confidence_low': 0.6,
            'completeness_low': 0.5,
            'boundary_distance_low': 0.5
        },
        'metadata': {
            'total_protocols': len(merged_clean),
            'cluster_1_count': int((y == 1).sum()),
            'cluster_2_count': int((y == 2).sum()),
        }
    }

    # Save to JSON
    with open(args.output, 'w') as f:
        json.dump(model, f, indent=2)

    print(f"\nModel exported to {args.output}")
    print(f"Bicorder version: {BICORDER_VERSION}")
    print(f"Total dimensions: {len(DIMENSIONS)}")
    print(f"Key dimensions (short form):")
    for dim in KEY_DIMENSIONS:
        print(f"  - {dim}")
    print(f"Model size: {len(json.dumps(model))} bytes")


if __name__ == '__main__':
    main()
|
||||
164
analysis/scripts/json_to_csv.py
Normal file
164
analysis/scripts/json_to_csv.py
Normal file
@@ -0,0 +1,164 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Convert a directory of individual bicorder JSON reading files into a diagnostic CSV.
|
||||
|
||||
This is the reverse of convert_csv_to_json.py. Each JSON file becomes one row.
|
||||
Handles readings across bicorder versions by matching on term_left/term_right pairs
|
||||
rather than column names.
|
||||
|
||||
Null gradient values (e.g., shortform readings that skip non-key dimensions) are
|
||||
written as empty cells so downstream analysis can treat them as NaN.
|
||||
|
||||
Usage:
|
||||
python3 scripts/json_to_csv.py data/readings/manual_20260320/json/
|
||||
python3 scripts/json_to_csv.py data/readings/manual_20260320/json/ -o data/readings/manual_20260320/readings.csv
|
||||
python3 scripts/json_to_csv.py data/readings/manual_20260320/json/ --shortform-only
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Map old term pairs to current column names (matches COLUMN_RENAMES in other scripts).
# Keys are (term_left, term_right) as found in older JSON files.
# NOTE(review): where the old pair's left/right order differs from the current
# one (the 'insufficient'/'sufficient' case) the recorded gradient value is NOT
# negated anywhere visible here — confirm downstream analysis accounts for the
# polarity flip.
TERM_RENAMES = {
    ('elite', 'vernacular'): ('institutional', 'vernacular'),
    ('exclusive', 'non-exclusive'): ('monopolistic', 'pluralistic'),
    ('insufficient', 'sufficient'): ('sufficient', 'limited'),  # note: order swapped in old versions
    ('Kafka', 'Whitehead'): ('restraining', 'liberating'),
}
|
||||
|
||||
|
||||
def load_bicorder_columns(bicorder_path):
    """Read ordered column definitions from bicorder.json.

    Returns:
        (columns, key_columns): the ordered list of "<set>_<left>_vs_<right>"
        column names, and the set of those flagged "shortform".
    """
    with open(bicorder_path) as fh:
        config = json.load(fh)

    ordered = []
    shortform = set()
    for diagnostic_set in config['diagnostic']:
        prefix = diagnostic_set['set_name']
        for grad in diagnostic_set['gradients']:
            name = f"{prefix}_{grad['term_left']}_vs_{grad['term_right']}"
            ordered.append(name)
            if grad.get('shortform', False):
                shortform.add(name)
    return ordered, shortform
|
||||
|
||||
|
||||
def normalize_terms(term_left, term_right):
    """Map an old term pair onto current bicorder.json terminology.

    Looks the pair up in TERM_RENAMES as given, then with left/right swapped
    (some old files reversed the poles); unknown pairs pass through unchanged.
    """
    direct = TERM_RENAMES.get((term_left, term_right))
    if direct is not None:
        return direct

    flipped = TERM_RENAMES.get((term_right, term_left))
    if flipped is not None:
        new_left, new_right = flipped
        # Restore the caller's (swapped) orientation
        return new_right, new_left

    return term_left, term_right
|
||||
|
||||
|
||||
def json_to_row(json_path, all_columns):
    """Convert one bicorder JSON reading file into a flat CSV row dict.

    Metadata fields come from the file's "metadata" block; each gradient value
    is keyed by its normalized "<set>_<left>_vs_<right>" column name. Null or
    absent gradient values become empty strings (read back downstream as NaN).
    """
    with open(json_path) as fh:
        reading = json.load(fh)

    meta = reading.get('metadata', {})
    row = {
        'Descriptor': meta.get('protocol', ''),
        'Description': '',  # not stored in individual reading files
        'analyst': meta.get('analyst', ''),
        'standpoint': meta.get('standpoint', ''),
        'timestamp': meta.get('timestamp', ''),
        'shortform': str(meta.get('shortform', '')),
        'version': reading.get('version', ''),
    }

    # Collect gradient values keyed by normalized column name
    values_by_column = {}
    for diagnostic_set in reading.get('diagnostic', []):
        prefix = diagnostic_set['set_name']
        for grad in diagnostic_set.get('gradients', []):
            left, right = normalize_terms(grad['term_left'], grad['term_right'])
            column = f"{prefix}_{left}_vs_{right}"
            value = grad.get('value')
            values_by_column[column] = '' if value is None else str(value)

    # Emit exactly the requested columns; anything missing stays empty
    for column in all_columns:
        row[column] = values_by_column.get(column, '')

    return row
|
||||
|
||||
|
||||
def main():
    """CLI entry point: convert a directory of JSON readings into one CSV."""
    parser = argparse.ArgumentParser(
        description='Convert directory of bicorder JSON files to a diagnostic CSV',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
  python3 scripts/json_to_csv.py data/readings/manual_20260320/json/
  python3 scripts/json_to_csv.py data/readings/manual_20260320/json/ --shortform-only
  python3 scripts/json_to_csv.py data/readings/manual_20260320/json/ -o data/readings/manual_20260320/readings.csv
"""
    )
    parser.add_argument('json_dir', help='Directory containing bicorder JSON reading files')
    parser.add_argument('-o', '--output', default=None,
                        help='Output CSV path (default: <dataset_dir>/readings.csv)')
    # NOTE(review): the default is resolved relative to the current working
    # directory, not this script's location — presumably run from analysis/.
    parser.add_argument('-b', '--bicorder', default='../bicorder.json',
                        help='Path to bicorder.json (default: ../bicorder.json)')
    parser.add_argument('--shortform-only', action='store_true',
                        help='Include only the key shortform dimensions (useful when most readings are shortform)')
    args = parser.parse_args()

    json_dir = Path(args.json_dir)
    # Datasets are self-contained: <dataset_dir>/json/ holds readings and
    # <dataset_dir>/readings.csv is the default output.
    dataset_dir = json_dir.parent
    output_path = Path(args.output) if args.output else dataset_dir / 'readings.csv'

    all_columns, key_columns = load_bicorder_columns(args.bicorder)

    if args.shortform_only:
        # Restrict to the key dimensions while preserving bicorder.json order
        columns = [c for c in all_columns if c in key_columns]
        print(f"Shortform mode: using {len(columns)} key dimensions")
    else:
        columns = all_columns

    json_files = sorted(json_dir.glob('*.json'))
    if not json_files:
        print(f"Error: no JSON files found in {json_dir}")
        return

    print(f"Converting {len(json_files)} JSON files → {output_path}")

    fieldnames = ['Descriptor', 'Description', 'analyst', 'standpoint',
                  'timestamp', 'shortform', 'version'] + columns

    rows = []
    for json_path in json_files:
        try:
            row = json_to_row(json_path, columns)
            rows.append(row)
        except Exception as e:
            # Best-effort conversion: report and skip a malformed file rather
            # than aborting the whole batch.
            print(f"  Warning: skipping {json_path.name}: {e}")

    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

    # Summary stats: count non-empty cells per dimension column
    filled = {col: sum(1 for r in rows if r.get(col)) for col in columns}
    print(f"Done. {len(rows)} rows written.")
    print(f"\nDimension coverage (readings with a value):")
    for col, count in filled.items():
        pct = count / len(rows) * 100 if rows else 0
        marker = '* ' if col in key_columns else '  '
        print(f"  {marker}{col}: {count}/{len(rows)} ({pct:.0f}%)")
    print(f"\n(* = shortform/key dimension)")


if __name__ == '__main__':
    main()
|
||||
177
analysis/scripts/lda_visualization.py
Normal file
177
analysis/scripts/lda_visualization.py
Normal file
@@ -0,0 +1,177 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Create LDA visualization to maximize cluster separation.
|
||||
|
||||
Usage:
|
||||
    python3 scripts/lda_visualization.py data/readings/synthetic_20251116/readings.csv
    python3 scripts/lda_visualization.py data/readings/manual_20260101/readings.csv --analysis-dir data/readings/manual_20260101/analysis
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def main():
    """CLI entry point: project clustered readings onto the LDA axis and plot.

    Loads the readings CSV and the k-means cluster assignments produced by
    multivariate_analysis.py, fits a 1-component LDA in standardized space,
    then writes a two-panel separation figure plus the projection and
    coefficient CSVs into the dataset's analysis/ directory.
    """
    parser = argparse.ArgumentParser(
        description='Create LDA visualization of cluster separation',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
  python3 scripts/lda_visualization.py data/readings/synthetic_20251116/readings.csv
  python3 scripts/lda_visualization.py data/readings/manual_20260101/readings.csv --analysis-dir data/readings/manual_20260101/analysis
"""
    )
    parser.add_argument('input_csv', help='Diagnostic readings CSV (e.g. data/readings/synthetic_20251116/readings.csv)')
    parser.add_argument('--analysis-dir', default=None,
                        help='Analysis directory (default: <dataset_dir>/analysis)')
    args = parser.parse_args()

    dataset_dir = Path(args.input_csv).parent
    results_dir = Path(args.analysis_dir) if args.analysis_dir else dataset_dir / 'analysis'
    # NOTE(review): plots/ and data/ are assumed to already exist (created by
    # multivariate_analysis.py) — savefig/to_csv will fail otherwise.
    plots_dir = results_dir / 'plots'
    data_dir = results_dir / 'data'

    # Load the original data
    df = pd.read_csv(args.input_csv)

    # Identify dimension columns by their diagnostic-set prefixes
    all_cols = df.columns.tolist()
    design_cols = [c for c in all_cols if c.startswith('Design_')]
    entanglement_cols = [c for c in all_cols if c.startswith('Entanglement_')]
    experience_cols = [c for c in all_cols if c.startswith('Experience_')]
    dimension_cols = design_cols + entanglement_cols + experience_cols

    # Load cluster assignments
    clusters = pd.read_csv(data_dir / 'kmeans_clusters.csv')
    df_with_clusters = df.merge(clusters, on='Descriptor')

    # Drop dimension columns with low coverage (< 80%) to handle shortform datasets
    n = len(df_with_clusters)
    coverage = df_with_clusters[dimension_cols].notna().sum() / n
    dimension_cols = [c for c in dimension_cols if coverage[c] >= 0.8]

    # Prepare data — impute any remaining NaNs with column median
    X_df = df_with_clusters[dimension_cols].copy()
    X_df = X_df.fillna(X_df.median())
    X = X_df.values
    y = df_with_clusters['cluster'].values

    # Standardize
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Fit LDA (with 1 component for 2 classes); ravel() flattens the (n, 1)
    # projection to 1-D so indexing below yields scalars, not 0-d arrays.
    lda = LinearDiscriminantAnalysis(n_components=1)
    X_lda = lda.fit_transform(X_scaled, y).ravel()

    # Create histogram showing separation
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 10))

    # Histogram: one translucent distribution per cluster
    colors = {1: '#2E86AB', 2: '#A23B72'}
    for cluster_id in [1, 2]:
        cluster_data = X_lda[y == cluster_id]
        ax1.hist(cluster_data, bins=30, alpha=0.6,
                 color=colors[cluster_id],
                 label=f'Cluster {cluster_id}',
                 edgecolor='white', linewidth=0.5)

    ax1.set_xlabel('Linear Discriminant (LD1)', fontsize=12)
    ax1.set_ylabel('Frequency', fontsize=12)
    ax1.set_title('Linear Discriminant Analysis: Cluster Separation\n(Maximum separation projection)',
                  fontsize=14, fontweight='bold')
    ax1.legend(fontsize=11)
    ax1.grid(True, alpha=0.3, axis='y')

    # Strip plot - shows individual protocols
    for cluster_id in [1, 2]:
        cluster_data = X_lda[y == cluster_id]
        cluster_protocols = df_with_clusters[df_with_clusters['cluster'] == cluster_id]['Descriptor'].values

        # Add jitter for visibility
        y_jitter = np.random.normal(cluster_id, 0.1, size=len(cluster_data))

        ax2.scatter(cluster_data, y_jitter,
                    c=colors[cluster_id], alpha=0.5, s=40,
                    edgecolors='white', linewidth=0.3)

        # Label a few representative protocols (every 25th)
        for i in range(0, len(cluster_data), 25):
            ax2.annotate(cluster_protocols[i],
                         (cluster_data[i], y_jitter[i]),
                         fontsize=7, alpha=0.7,
                         xytext=(0, 5), textcoords='offset points',
                         rotation=45, ha='left')

    ax2.set_xlabel('Linear Discriminant (LD1)', fontsize=12)
    ax2.set_ylabel('Cluster', fontsize=12)
    ax2.set_yticks([1, 2])
    ax2.set_yticklabels(['Cluster 1:\nRelational/Cultural', 'Cluster 2:\nInstitutional/Bureaucratic'])
    ax2.set_title('Individual Protocols Projected onto Discriminant Axis', fontsize=12)
    ax2.grid(True, alpha=0.3, axis='x')

    plt.tight_layout()
    plt.savefig(plots_dir / 'lda_cluster_separation.png', dpi=300, bbox_inches='tight')
    print(f"Saved: {plots_dir / 'lda_cluster_separation.png'}")

    # Calculate separation metrics
    mean_1 = X_lda[y == 1].mean()
    mean_2 = X_lda[y == 2].mean()
    std_1 = X_lda[y == 1].std()
    std_2 = X_lda[y == 2].std()

    # Cohen's d (effect size)
    pooled_std = np.sqrt((std_1**2 + std_2**2) / 2)
    cohens_d = abs(mean_1 - mean_2) / pooled_std

    print(f"\n=== Cluster Separation Statistics ===")
    # Defensive unwrap: if any statistic comes back as an ndarray rather than
    # a scalar, take its first element so the format specs below work.
    mean_1_val = mean_1[0] if isinstance(mean_1, np.ndarray) else mean_1
    mean_2_val = mean_2[0] if isinstance(mean_2, np.ndarray) else mean_2
    cohens_d_val = cohens_d[0] if isinstance(cohens_d, np.ndarray) else cohens_d
    print(f"Cluster 1 mean: {mean_1_val:.3f} (std: {std_1:.3f})")
    print(f"Cluster 2 mean: {mean_2_val:.3f} (std: {std_2:.3f})")
    print(f"Distance between means: {abs(mean_1_val - mean_2_val):.3f}")
    print(f"Cohen's d (effect size): {cohens_d_val:.3f}")
    print(f"  (0.2=small, 0.5=medium, 0.8=large effect)")

    # Overlap percentage (rough estimate from the range intersection)
    overlap_start = max(X_lda[y == 1].min(), X_lda[y == 2].min())
    overlap_end = min(X_lda[y == 1].max(), X_lda[y == 2].max())
    overlap_range = overlap_end - overlap_start if overlap_end > overlap_start else 0
    total_range = X_lda.max() - X_lda.min()
    overlap_pct = (overlap_range / total_range) * 100 if overlap_range > 0 else 0

    print(f"Approximate overlap: {overlap_pct:.1f}% of total range")

    # Save LDA projection data
    lda_df = pd.DataFrame({
        'Descriptor': df_with_clusters['Descriptor'],
        'LD1': X_lda.flatten(),
        'Cluster': y
    })
    lda_df.to_csv(data_dir / 'lda_projection.csv', index=False)
    print(f"Saved: {data_dir / 'lda_projection.csv'}")

    print("\n=== Most discriminating dimensions ===")
    loadings = pd.DataFrame({
        'Dimension': dimension_cols,
        'LDA_Coefficient': lda.coef_[0]
    })
    loadings['Abs_Coefficient'] = loadings['LDA_Coefficient'].abs()
    loadings = loadings.sort_values('Abs_Coefficient', ascending=False)

    print("\nTop 10 dimensions that separate the clusters:")
    for _, row in loadings.head(10).iterrows():
        print(f"  {row['Dimension']}: {row['LDA_Coefficient']:.3f}")

    loadings.to_csv(data_dir / 'lda_coefficients.csv', index=False)
    print(f"\nSaved: {data_dir / 'lda_coefficients.csv'}")


if __name__ == '__main__':
    main()
|
||||
858
analysis/scripts/multivariate_analysis.py
Normal file
858
analysis/scripts/multivariate_analysis.py
Normal file
@@ -0,0 +1,858 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Multivariate Analysis Script for Protocol Bicorder Data
|
||||
|
||||
Performs comprehensive multivariate statistical analyses on protocol diagnostic data,
|
||||
including clustering, dimensionality reduction, correlation analysis, and visualization.
|
||||
|
||||
Usage:
|
||||
python3 multivariate_analysis.py diagnostic_output.csv [--analyses all]
|
||||
python3 multivariate_analysis.py diagnostic_output.csv --analyses clustering pca
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
import warnings
|
||||
warnings.filterwarnings('ignore')
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
from scipy import stats
|
||||
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
|
||||
from scipy.spatial.distance import pdist, squareform
|
||||
import networkx as nx
|
||||
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.decomposition import PCA, FactorAnalysis
|
||||
from sklearn.manifold import TSNE
|
||||
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
|
||||
from sklearn.metrics import silhouette_score, davies_bouldin_score
|
||||
|
||||
try:
|
||||
import umap
|
||||
UMAP_AVAILABLE = True
|
||||
except ImportError:
|
||||
UMAP_AVAILABLE = False
|
||||
print("Note: UMAP not available. Install with: pip install umap-learn")
|
||||
|
||||
|
||||
class ProtocolAnalyzer:
|
||||
"""Main class for multivariate analysis of protocol data."""
|
||||
|
||||
def __init__(self, csv_path, output_dir='analysis_results', min_coverage=0.0):
|
||||
"""Initialize analyzer with data and output directory.
|
||||
|
||||
Args:
|
||||
csv_path: Path to diagnostic CSV file
|
||||
output_dir: Directory for analysis output
|
||||
min_coverage: Drop dimension columns with fewer than this fraction of
|
||||
non-null values (0.0–1.0). Useful for sparse/shortform
|
||||
datasets. E.g. 0.8 keeps only columns with ≥80% coverage.
|
||||
"""
|
||||
self.csv_path = Path(csv_path)
|
||||
self.output_dir = Path(output_dir)
|
||||
self.min_coverage = min_coverage
|
||||
self.output_dir.mkdir(exist_ok=True)
|
||||
|
||||
# Create subdirectories
|
||||
(self.output_dir / 'plots').mkdir(exist_ok=True)
|
||||
(self.output_dir / 'data').mkdir(exist_ok=True)
|
||||
(self.output_dir / 'reports').mkdir(exist_ok=True)
|
||||
|
||||
# Load and prepare data
|
||||
self.df = None
|
||||
self.dimension_cols = []
|
||||
self.design_cols = []
|
||||
self.entanglement_cols = []
|
||||
self.experience_cols = []
|
||||
self.scaled_data = None
|
||||
self.scaler = None
|
||||
|
||||
self._load_data()
|
||||
|
||||
    def _load_data(self):
        """Load the CSV, identify gradient columns, and standardize the data.

        Side effects: sets self.df (rows with missing dimension values
        dropped), self.dimension_cols and the per-category column lists,
        self.scaler, and self.scaled_data. When min_coverage > 0, low-coverage
        dimension columns are removed *before* rows are filtered — this keeps
        sparse/shortform datasets usable instead of dropping nearly every row.
        """
        print(f"Loading data from {self.csv_path}...")
        self.df = pd.read_csv(self.csv_path)

        # Identify dimension columns by their diagnostic-set prefixes
        all_cols = self.df.columns.tolist()
        self.design_cols = [c for c in all_cols if c.startswith('Design_')]
        self.entanglement_cols = [c for c in all_cols if c.startswith('Entanglement_')]
        self.experience_cols = [c for c in all_cols if c.startswith('Experience_')]
        self.dimension_cols = self.design_cols + self.entanglement_cols + self.experience_cols

        print(f"Loaded {len(self.df)} protocols with {len(self.dimension_cols)} dimensions")
        print(f"  - Design: {len(self.design_cols)}")
        print(f"  - Entanglement: {len(self.entanglement_cols)}")
        print(f"  - Experience: {len(self.experience_cols)}")

        # Drop low-coverage columns if min_coverage is set
        if self.min_coverage > 0.0:
            n = len(self.df)
            coverage = self.df[self.dimension_cols].notna().sum() / n
            dropped = [c for c in self.dimension_cols if coverage[c] < self.min_coverage]
            if dropped:
                print(f"\nDropping {len(dropped)} dimension(s) below {self.min_coverage:.0%} coverage:")
                for c in dropped:
                    print(f"  - {c}: {coverage[c]:.0%}")
                # Keep the per-category lists in sync with dimension_cols
                self.dimension_cols = [c for c in self.dimension_cols if c not in dropped]
                self.design_cols = [c for c in self.design_cols if c not in dropped]
                self.entanglement_cols = [c for c in self.entanglement_cols if c not in dropped]
                self.experience_cols = [c for c in self.experience_cols if c not in dropped]
                print(f"Remaining dimensions: {len(self.dimension_cols)}")

        # Check for missing values (in the columns that survived the filter)
        missing_count = self.df[self.dimension_cols].isna().sum().sum()
        rows_with_missing = self.df[self.dimension_cols].isna().any(axis=1).sum()

        if missing_count > 0:
            print(f"\nWarning: Found {missing_count} missing values in {rows_with_missing} rows")
            print("Dropping rows with missing values...")
            self.df = self.df.dropna(subset=self.dimension_cols)
            print(f"Dataset now contains {len(self.df)} protocols")

        # Standardize the dimension data (zero mean, unit variance per column)
        self.scaler = StandardScaler()
        self.scaled_data = self.scaler.fit_transform(self.df[self.dimension_cols])
|
||||
|
||||
def save_results(self, data, filename, subdir='data'):
|
||||
"""Save results to CSV file."""
|
||||
output_path = self.output_dir / subdir / filename
|
||||
if isinstance(data, pd.DataFrame):
|
||||
data.to_csv(output_path, index=False)
|
||||
else:
|
||||
pd.DataFrame(data).to_csv(output_path)
|
||||
print(f" Saved: {output_path}")
|
||||
|
||||
def save_plot(self, filename):
|
||||
"""Save current matplotlib figure."""
|
||||
output_path = self.output_dir / 'plots' / filename
|
||||
plt.tight_layout()
|
||||
plt.savefig(output_path, dpi=300, bbox_inches='tight')
|
||||
print(f" Saved: {output_path}")
|
||||
plt.close()
|
||||
|
||||
# ========== CLUSTERING ANALYSES ==========
|
||||
|
||||
    def kmeans_clustering(self, n_clusters_range=(2, 10)):
        """Perform K-means clustering, choosing k by silhouette score.

        Args:
            n_clusters_range: Inclusive (min_k, max_k) range of k to evaluate.

        Side effects: adds a 'kmeans_cluster' column to self.df and saves an
        elbow/silhouette plot plus kmeans_clusters.csv (1-indexed clusters).

        Returns:
            The 0-indexed cluster labels as a pandas Series.
        """
        print("\n=== K-Means Clustering ===")

        # Elbow method: fit once per candidate k, recording inertia + silhouette
        inertias = []
        silhouettes = []
        k_range = range(n_clusters_range[0], n_clusters_range[1] + 1)

        for k in k_range:
            kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
            labels = kmeans.fit_predict(self.scaled_data)
            inertias.append(kmeans.inertia_)
            if k > 1:
                silhouettes.append(silhouette_score(self.scaled_data, labels))
            else:
                # silhouette_score is undefined for a single cluster
                silhouettes.append(0)

        # Plot elbow curve and silhouette curve side by side
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

        ax1.plot(k_range, inertias, 'bo-')
        ax1.set_xlabel('Number of Clusters (k)')
        ax1.set_ylabel('Inertia')
        ax1.set_title('Elbow Method for Optimal k')
        ax1.grid(True, alpha=0.3)

        ax2.plot(k_range, silhouettes, 'ro-')
        ax2.set_xlabel('Number of Clusters (k)')
        ax2.set_ylabel('Silhouette Score')
        ax2.set_title('Silhouette Score by k')
        ax2.grid(True, alpha=0.3)

        self.save_plot('kmeans_elbow.png')

        # Use optimal k (highest silhouette)
        optimal_k = k_range[np.argmax(silhouettes)]
        print(f"Optimal k by silhouette score: {optimal_k}")

        # Final clustering at the chosen k
        kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
        self.df['kmeans_cluster'] = kmeans.fit_predict(self.scaled_data)

        # Save results
        results = self.df[['Descriptor', 'kmeans_cluster']].copy()
        results['cluster'] = results['kmeans_cluster'] + 1  # 1-indexed for readability
        self.save_results(results[['Descriptor', 'cluster']], 'kmeans_clusters.csv')

        # Cluster statistics
        print(f"\nCluster sizes:")
        print(self.df['kmeans_cluster'].value_counts().sort_index())

        return self.df['kmeans_cluster']
|
||||
|
||||
def hierarchical_clustering(self, n_clusters=5, method='ward'):
|
||||
"""Perform hierarchical clustering with dendrogram."""
|
||||
print("\n=== Hierarchical Clustering ===")
|
||||
|
||||
# Compute linkage
|
||||
Z = linkage(self.scaled_data, method=method)
|
||||
|
||||
# Plot dendrogram
|
||||
plt.figure(figsize=(16, 8))
|
||||
dendrogram(Z, labels=self.df['Descriptor'].values, leaf_font_size=8)
|
||||
plt.title(f'Hierarchical Clustering Dendrogram ({method} linkage)')
|
||||
plt.xlabel('Protocol')
|
||||
plt.ylabel('Distance')
|
||||
plt.xticks(rotation=90)
|
||||
self.save_plot('hierarchical_dendrogram.png')
|
||||
|
||||
# Cut tree to get clusters
|
||||
self.df['hierarchical_cluster'] = fcluster(Z, n_clusters, criterion='maxclust')
|
||||
|
||||
# Save results
|
||||
results = self.df[['Descriptor', 'hierarchical_cluster']].copy()
|
||||
results.columns = ['Descriptor', 'cluster']
|
||||
self.save_results(results, 'hierarchical_clusters.csv')
|
||||
|
||||
print(f"\nCluster sizes:")
|
||||
print(self.df['hierarchical_cluster'].value_counts().sort_index())
|
||||
|
||||
return self.df['hierarchical_cluster']
|
||||
|
||||
def dbscan_clustering(self, eps=3.0, min_samples=3):
|
||||
"""Perform DBSCAN clustering to identify outliers."""
|
||||
print("\n=== DBSCAN Clustering ===")
|
||||
|
||||
dbscan = DBSCAN(eps=eps, min_samples=min_samples)
|
||||
self.df['dbscan_cluster'] = dbscan.fit_predict(self.scaled_data)
|
||||
|
||||
n_clusters = len(set(self.df['dbscan_cluster'])) - (1 if -1 in self.df['dbscan_cluster'] else 0)
|
||||
n_outliers = (self.df['dbscan_cluster'] == -1).sum()
|
||||
|
||||
print(f"Found {n_clusters} clusters and {n_outliers} outliers")
|
||||
|
||||
# Save results
|
||||
results = self.df[['Descriptor', 'dbscan_cluster']].copy()
|
||||
results.columns = ['Descriptor', 'cluster']
|
||||
self.save_results(results, 'dbscan_clusters.csv')
|
||||
|
||||
if n_outliers > 0:
|
||||
outliers = self.df[self.df['dbscan_cluster'] == -1][['Descriptor']]
|
||||
self.save_results(outliers, 'dbscan_outliers.csv')
|
||||
print("\nOutlier protocols:")
|
||||
for protocol in outliers['Descriptor']:
|
||||
print(f" - {protocol}")
|
||||
|
||||
return self.df['dbscan_cluster']
|
||||
|
||||
# ========== DIMENSIONALITY REDUCTION ==========
|
||||
|
||||
def pca_analysis(self, n_components=None):
|
||||
"""Perform PCA and visualize results."""
|
||||
print("\n=== Principal Component Analysis ===")
|
||||
|
||||
# Fit PCA
|
||||
if n_components is None:
|
||||
pca = PCA()
|
||||
else:
|
||||
pca = PCA(n_components=n_components)
|
||||
|
||||
pca_coords = pca.fit_transform(self.scaled_data)
|
||||
|
||||
# Explained variance
|
||||
explained_var = pca.explained_variance_ratio_
|
||||
cumsum_var = np.cumsum(explained_var)
|
||||
|
||||
print(f"First 5 PCs explain {cumsum_var[4]*100:.1f}% of variance")
|
||||
|
||||
# Plot explained variance
|
||||
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
|
||||
|
||||
n_show = min(15, len(explained_var))
|
||||
ax1.bar(range(1, n_show + 1), explained_var[:n_show])
|
||||
ax1.set_xlabel('Principal Component')
|
||||
ax1.set_ylabel('Explained Variance Ratio')
|
||||
ax1.set_title('Variance Explained by Each PC')
|
||||
ax1.grid(True, alpha=0.3, axis='y')
|
||||
|
||||
ax2.plot(range(1, n_show + 1), cumsum_var[:n_show], 'o-')
|
||||
ax2.axhline(y=0.8, color='r', linestyle='--', alpha=0.5, label='80% threshold')
|
||||
ax2.set_xlabel('Number of Components')
|
||||
ax2.set_ylabel('Cumulative Explained Variance')
|
||||
ax2.set_title('Cumulative Variance Explained')
|
||||
ax2.legend()
|
||||
ax2.grid(True, alpha=0.3)
|
||||
|
||||
self.save_plot('pca_variance.png')
|
||||
|
||||
# 2D visualization
|
||||
plt.figure(figsize=(12, 10))
|
||||
plt.scatter(pca_coords[:, 0], pca_coords[:, 1], alpha=0.6, s=50)
|
||||
|
||||
# Annotate points
|
||||
for i, protocol in enumerate(self.df['Descriptor']):
|
||||
if i % 3 == 0: # Label every 3rd point to avoid clutter
|
||||
plt.annotate(protocol, (pca_coords[i, 0], pca_coords[i, 1]),
|
||||
fontsize=6, alpha=0.7)
|
||||
|
||||
plt.xlabel(f'PC1 ({explained_var[0]*100:.1f}% variance)')
|
||||
plt.ylabel(f'PC2 ({explained_var[1]*100:.1f}% variance)')
|
||||
plt.title('Protocols in PCA Space (First 2 Components)')
|
||||
plt.grid(True, alpha=0.3)
|
||||
self.save_plot('pca_2d.png')
|
||||
|
||||
# Save PCA coordinates
|
||||
pca_df = pd.DataFrame(pca_coords[:, :5],
|
||||
columns=[f'PC{i+1}' for i in range(min(5, pca_coords.shape[1]))])
|
||||
pca_df.insert(0, 'Descriptor', self.df['Descriptor'])
|
||||
self.save_results(pca_df, 'pca_coordinates.csv')
|
||||
|
||||
# Component loadings
|
||||
loadings = pd.DataFrame(
|
||||
pca.components_[:5, :].T,
|
||||
columns=[f'PC{i+1}' for i in range(min(5, pca.components_.shape[0]))],
|
||||
index=self.dimension_cols
|
||||
)
|
||||
self.save_results(loadings, 'pca_loadings.csv')
|
||||
|
||||
# Plot loadings heatmap
|
||||
plt.figure(figsize=(10, 12))
|
||||
sns.heatmap(loadings, cmap='RdBu_r', center=0, cbar_kws={'label': 'Loading'})
|
||||
plt.title('PCA Component Loadings')
|
||||
plt.tight_layout()
|
||||
self.save_plot('pca_loadings_heatmap.png')
|
||||
|
||||
return pca_coords, pca
|
||||
|
||||
def tsne_analysis(self, perplexity=30, n_components=2):
|
||||
"""Perform t-SNE analysis."""
|
||||
print("\n=== t-SNE Analysis ===")
|
||||
|
||||
tsne = TSNE(n_components=n_components, perplexity=perplexity, random_state=42, max_iter=1000)
|
||||
tsne_coords = tsne.fit_transform(self.scaled_data)
|
||||
|
||||
# Plot
|
||||
plt.figure(figsize=(12, 10))
|
||||
plt.scatter(tsne_coords[:, 0], tsne_coords[:, 1], alpha=0.6, s=50)
|
||||
|
||||
# Annotate some points
|
||||
for i, protocol in enumerate(self.df['Descriptor']):
|
||||
if i % 4 == 0: # Label every 4th point
|
||||
plt.annotate(protocol, (tsne_coords[i, 0], tsne_coords[i, 1]),
|
||||
fontsize=6, alpha=0.7)
|
||||
|
||||
plt.xlabel('t-SNE Dimension 1')
|
||||
plt.ylabel('t-SNE Dimension 2')
|
||||
plt.title(f't-SNE Projection (perplexity={perplexity})')
|
||||
plt.grid(True, alpha=0.3)
|
||||
self.save_plot('tsne_2d.png')
|
||||
|
||||
# Save coordinates
|
||||
tsne_df = pd.DataFrame(tsne_coords, columns=['TSNE1', 'TSNE2'])
|
||||
tsne_df.insert(0, 'Descriptor', self.df['Descriptor'])
|
||||
self.save_results(tsne_df, 'tsne_coordinates.csv')
|
||||
|
||||
return tsne_coords
|
||||
|
||||
def umap_analysis(self, n_neighbors=15, min_dist=0.1, n_components=2):
|
||||
"""Perform UMAP analysis if available."""
|
||||
if not UMAP_AVAILABLE:
|
||||
print("\n=== UMAP Analysis ===")
|
||||
print("UMAP not available. Install with: pip install umap-learn")
|
||||
return None
|
||||
|
||||
print("\n=== UMAP Analysis ===")
|
||||
|
||||
reducer = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist,
|
||||
n_components=n_components, random_state=42)
|
||||
umap_coords = reducer.fit_transform(self.scaled_data)
|
||||
|
||||
# Plot
|
||||
plt.figure(figsize=(12, 10))
|
||||
plt.scatter(umap_coords[:, 0], umap_coords[:, 1], alpha=0.6, s=50)
|
||||
|
||||
# Annotate some points
|
||||
for i, protocol in enumerate(self.df['Descriptor']):
|
||||
if i % 4 == 0:
|
||||
plt.annotate(protocol, (umap_coords[i, 0], umap_coords[i, 1]),
|
||||
fontsize=6, alpha=0.7)
|
||||
|
||||
plt.xlabel('UMAP Dimension 1')
|
||||
plt.ylabel('UMAP Dimension 2')
|
||||
plt.title(f'UMAP Projection (n_neighbors={n_neighbors}, min_dist={min_dist})')
|
||||
plt.grid(True, alpha=0.3)
|
||||
self.save_plot('umap_2d.png')
|
||||
|
||||
# Save coordinates
|
||||
umap_df = pd.DataFrame(umap_coords, columns=['UMAP1', 'UMAP2'])
|
||||
umap_df.insert(0, 'Descriptor', self.df['Descriptor'])
|
||||
self.save_results(umap_df, 'umap_coordinates.csv')
|
||||
|
||||
return umap_coords
|
||||
|
||||
def factor_analysis(self, n_factors=5):
|
||||
"""Perform factor analysis."""
|
||||
print("\n=== Factor Analysis ===")
|
||||
|
||||
fa = FactorAnalysis(n_components=n_factors, random_state=42)
|
||||
fa_coords = fa.fit_transform(self.scaled_data)
|
||||
|
||||
# Factor loadings
|
||||
loadings = pd.DataFrame(
|
||||
fa.components_.T,
|
||||
columns=[f'Factor{i+1}' for i in range(n_factors)],
|
||||
index=self.dimension_cols
|
||||
)
|
||||
self.save_results(loadings, 'factor_loadings.csv')
|
||||
|
||||
# Plot loadings heatmap
|
||||
plt.figure(figsize=(10, 12))
|
||||
sns.heatmap(loadings, cmap='RdBu_r', center=0, cbar_kws={'label': 'Loading'})
|
||||
plt.title('Factor Analysis Loadings')
|
||||
plt.tight_layout()
|
||||
self.save_plot('factor_loadings_heatmap.png')
|
||||
|
||||
# Save factor scores
|
||||
fa_df = pd.DataFrame(fa_coords,
|
||||
columns=[f'Factor{i+1}' for i in range(n_factors)])
|
||||
fa_df.insert(0, 'Descriptor', self.df['Descriptor'])
|
||||
self.save_results(fa_df, 'factor_scores.csv')
|
||||
|
||||
return fa_coords, fa
|
||||
|
||||
# ========== CORRELATION & STRUCTURE ==========
|
||||
|
||||
def correlation_analysis(self):
|
||||
"""Compute and visualize correlation matrices."""
|
||||
print("\n=== Correlation Analysis ===")
|
||||
|
||||
# Full correlation matrix
|
||||
corr_matrix = self.df[self.dimension_cols].corr()
|
||||
|
||||
# Plot full correlation heatmap
|
||||
plt.figure(figsize=(16, 14))
|
||||
sns.heatmap(corr_matrix, cmap='RdBu_r', center=0,
|
||||
square=True, linewidths=0.5, cbar_kws={'label': 'Correlation'})
|
||||
plt.title('Correlation Matrix - All Dimensions')
|
||||
plt.tight_layout()
|
||||
self.save_plot('correlation_heatmap_full.png')
|
||||
|
||||
# Save correlation matrix
|
||||
self.save_results(corr_matrix, 'correlation_matrix.csv')
|
||||
|
||||
# Find strongest correlations
|
||||
corr_pairs = []
|
||||
for i in range(len(corr_matrix.columns)):
|
||||
for j in range(i+1, len(corr_matrix.columns)):
|
||||
corr_pairs.append({
|
||||
'Dimension1': corr_matrix.columns[i],
|
||||
'Dimension2': corr_matrix.columns[j],
|
||||
'Correlation': corr_matrix.iloc[i, j]
|
||||
})
|
||||
|
||||
corr_df = pd.DataFrame(corr_pairs).sort_values('Correlation',
|
||||
key=abs,
|
||||
ascending=False)
|
||||
self.save_results(corr_df.head(20), 'top_correlations.csv')
|
||||
|
||||
print("\nTop 5 positive correlations:")
|
||||
for _, row in corr_df.head(5).iterrows():
|
||||
print(f" {row['Dimension1']} <-> {row['Dimension2']}: {row['Correlation']:.3f}")
|
||||
|
||||
print("\nTop 5 negative correlations:")
|
||||
for _, row in corr_df.tail(5).iterrows():
|
||||
print(f" {row['Dimension1']} <-> {row['Dimension2']}: {row['Correlation']:.3f}")
|
||||
|
||||
# Within-category correlations
|
||||
self._plot_category_correlation('Design', self.design_cols)
|
||||
self._plot_category_correlation('Entanglement', self.entanglement_cols)
|
||||
self._plot_category_correlation('Experience', self.experience_cols)
|
||||
|
||||
return corr_matrix
|
||||
|
||||
def _plot_category_correlation(self, category_name, columns):
|
||||
"""Plot correlation heatmap for a specific category."""
|
||||
corr = self.df[columns].corr()
|
||||
|
||||
plt.figure(figsize=(10, 8))
|
||||
sns.heatmap(corr, annot=True, fmt='.2f', cmap='RdBu_r', center=0,
|
||||
square=True, linewidths=0.5, cbar_kws={'label': 'Correlation'})
|
||||
plt.title(f'{category_name} Dimensions - Correlation Matrix')
|
||||
plt.tight_layout()
|
||||
self.save_plot(f'correlation_heatmap_{category_name.lower()}.png')
|
||||
|
||||
def network_analysis(self, threshold=0.5):
|
||||
"""Create network graph of protocol similarities."""
|
||||
print("\n=== Network Analysis ===")
|
||||
|
||||
# Compute pairwise distances
|
||||
distances = pdist(self.scaled_data, metric='euclidean')
|
||||
dist_matrix = squareform(distances)
|
||||
|
||||
# Convert to similarity (inverse of distance, normalized)
|
||||
max_dist = dist_matrix.max()
|
||||
similarity_matrix = 1 - (dist_matrix / max_dist)
|
||||
|
||||
# Create network
|
||||
G = nx.Graph()
|
||||
|
||||
# Add nodes
|
||||
for i, protocol in enumerate(self.df['Descriptor']):
|
||||
G.add_node(i, label=protocol)
|
||||
|
||||
# Add edges above threshold
|
||||
edge_count = 0
|
||||
for i in range(len(similarity_matrix)):
|
||||
for j in range(i+1, len(similarity_matrix)):
|
||||
if similarity_matrix[i, j] > threshold:
|
||||
G.add_edge(i, j, weight=similarity_matrix[i, j])
|
||||
edge_count += 1
|
||||
|
||||
print(f"Network with {G.number_of_nodes()} nodes and {edge_count} edges")
|
||||
|
||||
# Calculate network metrics
|
||||
if G.number_of_edges() > 0:
|
||||
degree_centrality = nx.degree_centrality(G)
|
||||
betweenness = nx.betweenness_centrality(G)
|
||||
|
||||
metrics_df = pd.DataFrame({
|
||||
'Descriptor': [self.df.iloc[i]['Descriptor'] for i in G.nodes()],
|
||||
'Degree_Centrality': [degree_centrality[i] for i in G.nodes()],
|
||||
'Betweenness_Centrality': [betweenness[i] for i in G.nodes()]
|
||||
}).sort_values('Degree_Centrality', ascending=False)
|
||||
|
||||
self.save_results(metrics_df, 'network_metrics.csv')
|
||||
|
||||
print("\nTop 5 most central protocols:")
|
||||
for _, row in metrics_df.head(5).iterrows():
|
||||
print(f" {row['Descriptor']}: {row['Degree_Centrality']:.3f}")
|
||||
|
||||
# Plot network
|
||||
plt.figure(figsize=(16, 16))
|
||||
pos = nx.spring_layout(G, k=0.5, iterations=50, seed=42)
|
||||
|
||||
# Node sizes based on degree centrality
|
||||
node_sizes = [degree_centrality[i] * 3000 + 100 for i in G.nodes()]
|
||||
|
||||
nx.draw_networkx_nodes(G, pos, node_size=node_sizes,
|
||||
node_color='lightblue', alpha=0.7)
|
||||
nx.draw_networkx_edges(G, pos, alpha=0.2)
|
||||
|
||||
# Labels for high-centrality nodes
|
||||
high_centrality = {i: self.df.iloc[i]['Descriptor']
|
||||
for i in G.nodes() if degree_centrality[i] > 0.1}
|
||||
nx.draw_networkx_labels(G, pos, labels=high_centrality, font_size=8)
|
||||
|
||||
plt.title(f'Protocol Similarity Network (threshold={threshold})')
|
||||
plt.axis('off')
|
||||
plt.tight_layout()
|
||||
self.save_plot('network_graph.png')
|
||||
else:
|
||||
print("No edges above threshold - try lowering the threshold")
|
||||
|
||||
return G
|
||||
|
||||
# ========== CLASSIFICATION & PREDICTION ==========
|
||||
|
||||
def category_discriminant_analysis(self):
|
||||
"""Analyze how well dimension categories discriminate protocols."""
|
||||
print("\n=== Category Discriminant Analysis ===")
|
||||
|
||||
results = []
|
||||
|
||||
for category_name, columns in [('Design', self.design_cols),
|
||||
('Entanglement', self.entanglement_cols),
|
||||
('Experience', self.experience_cols)]:
|
||||
|
||||
# Use one category to predict clustering from another
|
||||
X = self.df[columns].values
|
||||
|
||||
# Use kmeans clusters as target if available
|
||||
if 'kmeans_cluster' in self.df.columns:
|
||||
y = self.df['kmeans_cluster'].values
|
||||
|
||||
# LDA
|
||||
try:
|
||||
lda = LinearDiscriminantAnalysis()
|
||||
lda.fit(X, y)
|
||||
score = lda.score(X, y)
|
||||
|
||||
results.append({
|
||||
'Category': category_name,
|
||||
'Accuracy': score,
|
||||
'N_Dimensions': len(columns)
|
||||
})
|
||||
|
||||
print(f"{category_name} dimensions predict clusters with {score*100:.1f}% accuracy")
|
||||
except:
|
||||
print(f"Could not perform LDA for {category_name}")
|
||||
|
||||
if results:
|
||||
results_df = pd.DataFrame(results)
|
||||
self.save_results(results_df, 'category_discriminant_results.csv')
|
||||
|
||||
return results
|
||||
|
||||
def feature_importance_analysis(self):
|
||||
"""Analyze which dimensions are most important for clustering."""
|
||||
print("\n=== Feature Importance Analysis ===")
|
||||
|
||||
if 'kmeans_cluster' not in self.df.columns:
|
||||
print("Run clustering first to enable feature importance analysis")
|
||||
return None
|
||||
|
||||
# Random Forest classifier
|
||||
X = self.df[self.dimension_cols].values
|
||||
y = self.df['kmeans_cluster'].values
|
||||
|
||||
rf = RandomForestClassifier(n_estimators=100, random_state=42)
|
||||
rf.fit(X, y)
|
||||
|
||||
# Feature importances
|
||||
importances = pd.DataFrame({
|
||||
'Dimension': self.dimension_cols,
|
||||
'Importance': rf.feature_importances_
|
||||
}).sort_values('Importance', ascending=False)
|
||||
|
||||
self.save_results(importances, 'feature_importances.csv')
|
||||
|
||||
# Plot top 20
|
||||
plt.figure(figsize=(10, 12))
|
||||
top_20 = importances.head(20)
|
||||
plt.barh(range(len(top_20)), top_20['Importance'])
|
||||
plt.yticks(range(len(top_20)), top_20['Dimension'])
|
||||
plt.xlabel('Importance')
|
||||
plt.title('Top 20 Most Important Dimensions for Clustering')
|
||||
plt.gca().invert_yaxis()
|
||||
plt.tight_layout()
|
||||
self.save_plot('feature_importances.png')
|
||||
|
||||
print("\nTop 10 most important dimensions:")
|
||||
for _, row in importances.head(10).iterrows():
|
||||
print(f" {row['Dimension']}: {row['Importance']:.4f}")
|
||||
|
||||
return importances
|
||||
|
||||
def analyst_comparison(self):
|
||||
"""Compare ratings across different analysts."""
|
||||
print("\n=== Analyst Comparison ===")
|
||||
|
||||
if 'analyst' not in self.df.columns:
|
||||
print("No analyst column found")
|
||||
return None
|
||||
|
||||
analysts = self.df['analyst'].unique()
|
||||
print(f"Found {len(analysts)} unique analysts")
|
||||
|
||||
# Mean ratings by analyst for each dimension
|
||||
analyst_means = self.df.groupby('analyst')[self.dimension_cols].mean()
|
||||
self.save_results(analyst_means, 'analyst_mean_ratings.csv')
|
||||
|
||||
# Plot comparison
|
||||
fig, axes = plt.subplots(3, 1, figsize=(14, 12))
|
||||
|
||||
for idx, (category_name, columns) in enumerate([
|
||||
('Design', self.design_cols),
|
||||
('Entanglement', self.entanglement_cols),
|
||||
('Experience', self.experience_cols)
|
||||
]):
|
||||
analyst_means[columns].T.plot(ax=axes[idx], marker='o')
|
||||
axes[idx].set_title(f'{category_name} Dimensions - Mean Ratings by Analyst')
|
||||
axes[idx].set_ylabel('Mean Rating')
|
||||
axes[idx].legend(title='Analyst', bbox_to_anchor=(1.05, 1), loc='upper left')
|
||||
axes[idx].grid(True, alpha=0.3)
|
||||
axes[idx].set_xticklabels(axes[idx].get_xticklabels(), rotation=45, ha='right')
|
||||
|
||||
plt.tight_layout()
|
||||
self.save_plot('analyst_comparison.png')
|
||||
|
||||
return analyst_means
|
||||
|
||||
# ========== SUMMARY REPORT ==========
|
||||
|
||||
def generate_summary_report(self):
|
||||
"""Generate a text summary of all analyses."""
|
||||
print("\n=== Generating Summary Report ===")
|
||||
|
||||
report_lines = []
|
||||
report_lines.append("=" * 80)
|
||||
report_lines.append("MULTIVARIATE ANALYSIS SUMMARY REPORT")
|
||||
report_lines.append("Protocol Bicorder Dataset")
|
||||
report_lines.append("=" * 80)
|
||||
report_lines.append("")
|
||||
|
||||
report_lines.append(f"Dataset: {self.csv_path}")
|
||||
report_lines.append(f"Number of protocols: {len(self.df)}")
|
||||
report_lines.append(f"Number of dimensions: {len(self.dimension_cols)}")
|
||||
report_lines.append(f" - Design: {len(self.design_cols)}")
|
||||
report_lines.append(f" - Entanglement: {len(self.entanglement_cols)}")
|
||||
report_lines.append(f" - Experience: {len(self.experience_cols)}")
|
||||
report_lines.append("")
|
||||
|
||||
report_lines.append("-" * 80)
|
||||
report_lines.append("ANALYSES PERFORMED")
|
||||
report_lines.append("-" * 80)
|
||||
|
||||
# Check which analyses were run
|
||||
analyses_run = []
|
||||
|
||||
if 'kmeans_cluster' in self.df.columns:
|
||||
analyses_run.append("- K-Means Clustering")
|
||||
report_lines.append(f"K-Means: {len(self.df['kmeans_cluster'].unique())} clusters identified")
|
||||
|
||||
if 'hierarchical_cluster' in self.df.columns:
|
||||
analyses_run.append("- Hierarchical Clustering")
|
||||
report_lines.append(f"Hierarchical: {len(self.df['hierarchical_cluster'].unique())} clusters")
|
||||
|
||||
if 'dbscan_cluster' in self.df.columns:
|
||||
analyses_run.append("- DBSCAN Clustering")
|
||||
n_outliers = (self.df['dbscan_cluster'] == -1).sum()
|
||||
report_lines.append(f"DBSCAN: {n_outliers} outlier protocols identified")
|
||||
|
||||
report_lines.append("")
|
||||
report_lines.append("Dimensionality Reduction:")
|
||||
report_lines.append("- Principal Component Analysis (PCA)")
|
||||
report_lines.append("- t-SNE Projection")
|
||||
if UMAP_AVAILABLE:
|
||||
report_lines.append("- UMAP Projection")
|
||||
report_lines.append("- Factor Analysis")
|
||||
report_lines.append("")
|
||||
|
||||
report_lines.append("Statistical Analyses:")
|
||||
report_lines.append("- Correlation Analysis")
|
||||
report_lines.append("- Network Analysis")
|
||||
report_lines.append("- Feature Importance Analysis")
|
||||
|
||||
if 'analyst' in self.df.columns:
|
||||
report_lines.append("- Analyst Comparison")
|
||||
|
||||
report_lines.append("")
|
||||
report_lines.append("-" * 80)
|
||||
report_lines.append("OUTPUT FILES")
|
||||
report_lines.append("-" * 80)
|
||||
report_lines.append(f"All results saved to: {self.output_dir}/")
|
||||
report_lines.append(" - plots/ : All visualizations (PNG)")
|
||||
report_lines.append(" - data/ : All numerical results (CSV)")
|
||||
report_lines.append(" - reports/ : This summary report")
|
||||
report_lines.append("")
|
||||
|
||||
report_lines.append("=" * 80)
|
||||
report_lines.append("END OF REPORT")
|
||||
report_lines.append("=" * 80)
|
||||
|
||||
report_text = "\n".join(report_lines)
|
||||
|
||||
# Save report
|
||||
report_path = self.output_dir / 'reports' / 'analysis_summary.txt'
|
||||
with open(report_path, 'w') as f:
|
||||
f.write(report_text)
|
||||
|
||||
print(f" Saved: {report_path}")
|
||||
print("\n" + report_text)
|
||||
|
||||
return report_text
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse arguments and run the requested analyses."""
    parser = argparse.ArgumentParser(
        description='Multivariate analysis of Protocol Bicorder data',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python3 scripts/multivariate_analysis.py data/readings/synthetic_20251116/readings.csv
  python3 scripts/multivariate_analysis.py data/readings/synthetic_20251116/readings.csv --output data/readings/synthetic_20251116/analysis
  python3 scripts/multivariate_analysis.py data/readings/synthetic_20251116/readings.csv --analyses clustering pca
        """
    )

    parser.add_argument('csv_file', help='Diagnostic readings CSV (e.g. data/readings/synthetic_20251116/readings.csv)')
    parser.add_argument('--output', '-o', default=None,
                        help='Output directory (default: <dataset_dir>/analysis)')
    parser.add_argument('--min-coverage', type=float, default=0.0,
                        help='Drop dimension columns below this coverage fraction (0.0–1.0). '
                             'E.g. 0.8 keeps only columns ≥80%% complete. '
                             'Useful for sparse/shortform datasets (default: 0.0, keep all)')
    parser.add_argument('--analyses', nargs='+',
                        choices=['clustering', 'pca', 'tsne', 'umap', 'factor',
                                 'correlation', 'network', 'importance', 'analyst', 'all'],
                        default=['all'],
                        help='Which analyses to run (default: all)')

    args = parser.parse_args()

    csv_path = Path(args.csv_file)
    if not csv_path.exists():
        print(f"Error: File not found: {args.csv_file}")
        sys.exit(1)

    # Default the output directory to <dataset_dir>/analysis.
    output_dir = args.output if args.output else str(csv_path.parent / 'analysis')

    print("=" * 80)
    print("PROTOCOL BICORDER - MULTIVARIATE ANALYSIS")
    print("=" * 80)

    analyzer = ProtocolAnalyzer(args.csv_file, output_dir, min_coverage=args.min_coverage)

    selected = set(args.analyses)

    def wanted(name):
        # 'all' switches every analysis on.
        return 'all' in selected or name in selected

    try:
        # Clustering
        if wanted('clustering'):
            analyzer.kmeans_clustering()
            analyzer.hierarchical_clustering()
            analyzer.dbscan_clustering()

        # Dimensionality reduction
        if wanted('pca'):
            analyzer.pca_analysis()
        if wanted('tsne'):
            analyzer.tsne_analysis()
        if wanted('umap'):
            analyzer.umap_analysis()
        if wanted('factor'):
            analyzer.factor_analysis()

        # Correlation and structure
        if wanted('correlation'):
            analyzer.correlation_analysis()
        if wanted('network'):
            analyzer.network_analysis(threshold=0.6)

        # Classification
        if wanted('importance'):
            analyzer.category_discriminant_analysis()
            analyzer.feature_importance_analysis()
        if wanted('analyst'):
            analyzer.analyst_comparison()

        # Always finish with the summary report.
        analyzer.generate_summary_report()

        print("\n" + "=" * 80)
        print("ANALYSIS COMPLETE!")
        print("=" * 80)
        print(f"\nAll results saved to: {analyzer.output_dir}/")

    except Exception as e:
        print(f"\nError during analysis: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
|
||||
|
||||
|
||||
# Script entry point: run the CLI only when executed directly.
if __name__ == '__main__':
    main()
|
||||
206
analysis/scripts/review_analysis.py
Normal file
206
analysis/scripts/review_analysis.py
Normal file
@@ -0,0 +1,206 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Comprehensive review of the analysis for errors and inconsistencies.
|
||||
|
||||
Usage:
    python3 scripts/review_analysis.py data/readings/synthetic_20251116/readings.csv
    python3 scripts/review_analysis.py data/readings/manual_20260101/readings.csv --analysis-dir data/readings/manual_20260101/analysis
"""
|
||||
|
||||
import argparse
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def main():
    """Run sanity checks over a dataset CSV and its analysis outputs.

    Verifies data completeness, value ranges, clustering consistency,
    correlation-matrix validity, the average-value distribution, and
    cluster separation, then prints a summary of errors and warnings.
    """
    parser = argparse.ArgumentParser(
        description='Check analysis results for errors and inconsistencies',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
  python3 scripts/review_analysis.py data/readings/synthetic_20251116/readings.csv
  python3 scripts/review_analysis.py data/readings/manual_20260101/readings.csv --analysis-dir data/readings/manual_20260101/analysis
        """
    )
    parser.add_argument('input_csv', help='Diagnostic readings CSV (e.g. data/readings/synthetic_20251116/readings.csv)')
    parser.add_argument('--analysis-dir', default=None,
                        help='Analysis directory (default: <dataset_dir>/analysis)')
    args = parser.parse_args()

    dataset_dir = Path(args.input_csv).parent
    results_dir = Path(args.analysis_dir) if args.analysis_dir else dataset_dir / 'analysis'

    print("=" * 80)
    print("ANALYSIS REVIEW - ERROR CHECKING")
    print("=" * 80)
    print(f"Dataset: {args.input_csv}")
    print(f"Results: {results_dir}")

    # Load source data and analysis artifacts.
    df = pd.read_csv(args.input_csv)
    clusters = pd.read_csv(results_dir / 'data' / 'kmeans_clusters.csv')
    pca_coords = pd.read_csv(results_dir / 'data' / 'pca_coordinates.csv')

    # Dimension columns follow the <Category>_<name> naming convention.
    design_cols = [c for c in df.columns if c.startswith('Design_')]
    entanglement_cols = [c for c in df.columns if c.startswith('Entanglement_')]
    experience_cols = [c for c in df.columns if c.startswith('Experience_')]
    dimension_cols = design_cols + entanglement_cols + experience_cols
    # FIX: derive the dimension count instead of hardcoding 23, so shortform
    # or --min-coverage-reduced datasets validate correctly.
    n_dims = len(dimension_cols)

    errors_found = []
    warnings_found = []

    print("\n1. DATA COMPLETENESS CHECK")
    print("-" * 80)

    rows_with_missing = df[dimension_cols].isna().any(axis=1).sum()

    print(f"✓ Total protocols in source data: {len(df)}")
    print(f"✓ Protocols with complete data: {len(df) - rows_with_missing}")
    print(f"✓ Protocols with missing values: {rows_with_missing}")
    print(f"✓ Protocols in cluster analysis: {len(clusters)}")

    if rows_with_missing > 0:
        warnings_found.append(f"{rows_with_missing} protocols excluded due to missing values")
        missing_protocols = df[df[dimension_cols].isna().any(axis=1)]['Descriptor'].tolist()
        print(f"\n Excluded protocols: {', '.join(missing_protocols)}")

    merged = df.merge(clusters, on='Descriptor', how='inner')
    if len(merged) != len(clusters):
        errors_found.append(f"Descriptor mismatch: {len(merged)} matched vs {len(clusters)} expected")
    else:
        print("✓ All cluster descriptors match source data")

    print("\n2. DATA QUALITY CHECK")
    print("-" * 80)

    # FIX: the success line used to print unconditionally, even when
    # out-of-range errors had just been recorded.
    range_errors = 0
    for col in dimension_cols:
        values = df[col].dropna()
        if len(values) and (values.min() < 1 or values.max() > 9):
            errors_found.append(f"Column {col} has out-of-range values: [{values.min()}, {values.max()}]")
            range_errors += 1
    if range_errors == 0:
        print("✓ All dimension values within expected range [1, 9]")

    df_clean = df.dropna(subset=dimension_cols)
    variances = df_clean[dimension_cols].var()
    low_var_dims = variances[variances < 1.0]
    if len(low_var_dims) > 0:
        warnings_found.append(f"{len(low_var_dims)} dimensions have very low variance (< 1.0)")
        print("\n Low variance dimensions:")
        for dim, var in low_var_dims.items():
            print(f" - {dim}: {var:.3f}")
    else:
        print("✓ All dimensions have reasonable variance")

    print("\n3. CLUSTERING VALIDATION")
    print("-" * 80)

    # FIX: iterate over whatever cluster labels exist instead of hardcoding
    # labels 1 and 2 (which raised KeyError for any other k or labeling).
    cluster_sizes = clusters['cluster'].value_counts().sort_index()
    for label, size in cluster_sizes.items():
        print(f"✓ Cluster {label}: {size} protocols ({size/len(clusters)*100:.1f}%)")

    imbalance_ratio = max(cluster_sizes) / min(cluster_sizes)
    if imbalance_ratio > 2.0:
        warnings_found.append(f"Cluster imbalance ratio is {imbalance_ratio:.2f} (ideally < 2.0)")

    if len(pca_coords) != len(clusters):
        errors_found.append(f"PCA coordinates count ({len(pca_coords)}) != cluster count ({len(clusters)})")
    else:
        print("✓ PCA coordinates match cluster count")

    pca_loadings = pd.read_csv(results_dir / 'data' / 'pca_loadings.csv', index_col=0)
    if pca_loadings.shape[0] != n_dims:
        errors_found.append(f"PCA loadings have {pca_loadings.shape[0]} rows, expected {n_dims}")
    else:
        print("✓ PCA loadings have correct dimensions")

    print("\n4. STATISTICAL VALIDITY")
    print("-" * 80)

    corr_matrix = pd.read_csv(results_dir / 'data' / 'correlation_matrix.csv', index_col=0)

    # Zero the diagonal so self-correlations don't trip the check.
    np.fill_diagonal(corr_matrix.values, 0)
    perfect_corrs = np.where(np.abs(corr_matrix.values) > 0.99)
    if len(perfect_corrs[0]) > 0:
        warnings_found.append(f"Found {len(perfect_corrs[0])} near-perfect correlations between dimensions")
    else:
        print("✓ No perfect correlations found (multicollinearity check)")

    try:
        if corr_matrix.shape[0] == corr_matrix.shape[1]:
            if not np.allclose(corr_matrix.values, corr_matrix.values.T, equal_nan=True):
                errors_found.append("Correlation matrix is not symmetric")
            else:
                print("✓ Correlation matrix is symmetric")
        else:
            errors_found.append(f"Correlation matrix is not square: {corr_matrix.shape}")
    except Exception as e:
        warnings_found.append(f"Could not verify correlation matrix symmetry: {e}")

    print("\n5. AVERAGE VALUES CHECK")
    print("-" * 80)

    calculated_averages = df_clean[dimension_cols].mean(axis=1)
    print(f"✓ Average values range: [{calculated_averages.min():.2f}, {calculated_averages.max():.2f}]")
    print(f"✓ Mean of averages: {calculated_averages.mean():.2f}")
    print(f"✓ Std of averages: {calculated_averages.std():.2f}")

    # Chi-square test against a uniform distribution of per-protocol averages.
    from scipy import stats
    bins = np.arange(int(calculated_averages.min()), int(calculated_averages.max()) + 1, 0.5)
    observed_counts, _ = np.histogram(calculated_averages, bins=bins)
    expected_count = len(calculated_averages) / len(bins[:-1])
    chi2_stat = np.sum((observed_counts - expected_count)**2 / expected_count)
    p_value = 1 - stats.chi2.cdf(chi2_stat, len(bins) - 2)

    print(f"✓ Distribution uniformity test p-value: {p_value:.4f}")
    if p_value < 0.05:
        print(" (Distribution is significantly non-uniform, as expected for real data)")
    else:
        warnings_found.append("Average values may be too uniformly distributed (p > 0.05)")

    print("\n6. CLUSTER SEPARATION CHECK")
    print("-" * 80)

    merged = df_clean.merge(clusters, on='Descriptor')
    # FIX: derive the two cluster labels from the data instead of assuming
    # labels 1 and 2; skip gracefully when fewer than two clusters exist.
    labels = sorted(merged['cluster'].unique())
    if len(labels) >= 2:
        means_a = merged[merged['cluster'] == labels[0]][dimension_cols].mean()
        means_b = merged[merged['cluster'] == labels[1]][dimension_cols].mean()

        differences = (means_a - means_b).abs()
        significant_diffs = differences[differences > 0.5]
        print(f"✓ Dimensions with meaningful difference (>0.5) between clusters: {len(significant_diffs)}/{n_dims}")

        if len(significant_diffs) < 5:
            warnings_found.append(f"Only {len(significant_diffs)} dimensions show meaningful separation between clusters")

        print("\n Top 5 differentiating dimensions:")
        for dim in differences.nlargest(5).index:
            print(f" - {dim}: {differences[dim]:.3f}")
    else:
        warnings_found.append("Fewer than two clusters found; separation check skipped")

    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)

    if len(errors_found) == 0:
        print("✓ No critical errors found!")
    else:
        print(f"✗ {len(errors_found)} CRITICAL ERROR(S) FOUND:")
        for i, error in enumerate(errors_found, 1):
            print(f" {i}. {error}")

    if len(warnings_found) == 0:
        print("✓ No warnings!")
    else:
        print(f"\n⚠ {len(warnings_found)} WARNING(S):")
        for i, warning in enumerate(warnings_found, 1):
            print(f" {i}. {warning}")

    print("\n" + "=" * 80)
    print("REVIEW COMPLETE")
    print("=" * 80)
|
||||
|
||||
|
||||
# Script entry point: run the CLI only when executed directly.
if __name__ == '__main__':
    main()
|
||||
107
analysis/scripts/sync_readings.sh
Executable file
107
analysis/scripts/sync_readings.sh
Executable file
@@ -0,0 +1,107 @@
|
||||
#!/usr/bin/env bash
# Sync a readings dataset from a remote git repository, then regenerate CSV and analysis.
#
# Reads remote URL and subdirectory from a .sync_source file in the dataset directory.
#
# Usage:
#   scripts/sync_readings.sh data/readings/manual_20260320
#   scripts/sync_readings.sh data/readings/manual_20260320 --no-analysis
#   scripts/sync_readings.sh data/readings/manual_20260320 --min-coverage 0.8
#   scripts/sync_readings.sh data/readings/manual_20260320 --training data/readings/synthetic_20251116/readings.csv
#
# .sync_source format:
#   REMOTE_URL=https://git.example.org/user/repo
#   REMOTE_SUBDIR=readings

set -euo pipefail

# FIX: usage string now documents --training as well.
DATASET_DIR="${1:?Usage: $0 <dataset_dir> [--no-analysis] [--min-coverage N] [--training CSV]}"
RUN_ANALYSIS=true
MIN_COVERAGE=0.8
TRAINING_CSV="data/readings/synthetic_20251116/readings.csv"

shift || true
while [[ $# -gt 0 ]]; do
    case "$1" in
        --no-analysis) RUN_ANALYSIS=false ;;
        --min-coverage) MIN_COVERAGE="${2:?--min-coverage requires a value}"; shift ;;
        --training) TRAINING_CSV="${2:?--training requires a value}"; shift ;;
        *) echo "Unknown option: $1" >&2; exit 1 ;;
    esac
    shift
done

SYNC_SOURCE="$DATASET_DIR/.sync_source"
if [[ ! -f "$SYNC_SOURCE" ]]; then
    echo "Error: $SYNC_SOURCE not found. Create it with REMOTE_URL and REMOTE_SUBDIR." >&2
    exit 1
fi

# Load config.
# FIX: `|| true` — under `set -e -o pipefail` a missing key made grep abort
# the script before the friendly error message below could run.
REMOTE_URL=$(grep '^REMOTE_URL=' "$SYNC_SOURCE" | cut -d= -f2- || true)
REMOTE_SUBDIR=$(grep '^REMOTE_SUBDIR=' "$SYNC_SOURCE" | cut -d= -f2- || true)

if [[ -z "$REMOTE_URL" ]]; then
    echo "Error: REMOTE_URL not set in $SYNC_SOURCE" >&2
    exit 1
fi

REMOTE_SUBDIR="${REMOTE_SUBDIR:-readings}"
JSON_DIR="$DATASET_DIR/json"

echo "========================================"
echo "Syncing: $DATASET_DIR"
echo "From: $REMOTE_URL/$REMOTE_SUBDIR"
echo "========================================"

# Clone remote to temp dir and copy JSON files.
# FIX: renamed TMPDIR -> WORKDIR; TMPDIR is a standard environment variable
# consulted by mktemp and child processes, so clobbering it is unsafe.
WORKDIR=$(mktemp -d)
trap 'rm -rf "$WORKDIR"' EXIT

echo ""
echo "Fetching remote data..."
git clone --depth 1 --quiet "$REMOTE_URL" "$WORKDIR"

SRC="$WORKDIR/$REMOTE_SUBDIR"
if [[ ! -d "$SRC" ]]; then
    echo "Error: subdirectory '$REMOTE_SUBDIR' not found in remote repo." >&2
    exit 1
fi

# FIX: the old recursive `find` count could disagree with the non-recursive
# `cp "$SRC"/*.json`; count and copy now both cover exactly the top level,
# and an empty glob is reported instead of crashing cp under `set -e`.
mkdir -p "$JSON_DIR"
NEW=0
for f in "$SRC"/*.json; do
    [[ -e "$f" ]] || continue    # glob matched nothing
    cp "$f" "$JSON_DIR"/
    NEW=$((NEW + 1))
done
if [[ "$NEW" -eq 0 ]]; then
    echo "Error: no JSON files found in $SRC" >&2
    exit 1
fi
echo "Copied $NEW JSON files → $JSON_DIR"

# Prefer the project venv's interpreter when present.
PYTHON=python3
if [[ -f ".venv/bin/python3" ]]; then
    PYTHON=".venv/bin/python3"
fi

# Regenerate CSV
echo ""
echo "Regenerating readings.csv..."
"$PYTHON" scripts/json_to_csv.py "$JSON_DIR" -o "$DATASET_DIR/readings.csv"

if [[ "$RUN_ANALYSIS" == true ]]; then
    echo ""
    echo "Running multivariate analysis (--min-coverage $MIN_COVERAGE)..."
    "$PYTHON" scripts/multivariate_analysis.py \
        "$DATASET_DIR/readings.csv" \
        --min-coverage "$MIN_COVERAGE" \
        --analyses clustering pca correlation importance

    echo ""
    echo "Generating LDA visualization..."
    "$PYTHON" scripts/lda_visualization.py "$DATASET_DIR/readings.csv"

    echo ""
    echo "Classifying readings (training: $TRAINING_CSV)..."
    "$PYTHON" scripts/classify_readings.py \
        "$DATASET_DIR/readings.csv" \
        --training "$TRAINING_CSV"
fi

echo ""
echo "Done. Dataset: $DATASET_DIR"
|
||||
167
analysis/scripts/visualize_clusters.py
Normal file
167
analysis/scripts/visualize_clusters.py
Normal file
@@ -0,0 +1,167 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Create visualizations of k-means clusters overlaid on dimensionality reduction plots.
|
||||
|
||||
Usage:
|
||||
python3 scripts/visualize_clusters.py data/readings/synthetic_20251116.csv
|
||||
python3 scripts/visualize_clusters.py data/readings/manual_20260101.csv --results-dir analysis_results/manual_20260101
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Cluster color scheme and human-readable labels (cluster ids are 1-indexed
# in kmeans_clusters.csv).
_CLUSTER_COLORS = {1: '#2E86AB', 2: '#A23B72'}  # blue / purple
_CLUSTER_NAMES = {
    1: 'Cluster 1: Relational/Cultural',
    2: 'Cluster 2: Institutional/Bureaucratic',
}


def _plot_clustered(data, xcol, ycol, xlabel, ylabel, title, out_path):
    """Render one cluster-colored scatter plot and save it to *out_path*.

    Parameters:
        data: DataFrame with columns *xcol*, *ycol*, 'cluster', 'Descriptor'.
        xcol, ycol: names of the 2-D embedding coordinate columns.
        xlabel, ylabel, title: axis labels and figure title.
        out_path: destination PNG path (parent directory must exist).
    """
    fig, ax = plt.subplots(figsize=(14, 12))

    # One scatter call per cluster so each gets its own color/legend entry.
    for cluster_id in (1, 2):
        pts = data[data['cluster'] == cluster_id]
        ax.scatter(pts[xcol], pts[ycol],
                   c=_CLUSTER_COLORS[cluster_id],
                   label=_CLUSTER_NAMES[cluster_id],
                   alpha=0.6, s=60, edgecolors='white', linewidth=0.5)

    # Annotate only every 8th row (by merged-frame index) to keep the
    # plot legible on datasets with many protocols.
    for cluster_id in (1, 2):
        pts = data[data['cluster'] == cluster_id]
        for i, row in pts.iterrows():
            if i % 8 == 0:
                ax.annotate(row['Descriptor'],
                            (row[xcol], row[ycol]),
                            fontsize=7, alpha=0.7,
                            xytext=(5, 5), textcoords='offset points')

    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.legend(loc='best', fontsize=10, framealpha=0.9)
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(out_path, dpi=300, bbox_inches='tight')
    print(f"  Saved: {out_path}")
    plt.close()


def _print_summary(clusters):
    """Print cluster sizes and up to ten sample descriptors per cluster."""
    print("\n=== Cluster Summary ===")
    print(f"Total protocols: {len(clusters)}")
    print(f"\nCluster 1 (Relational/Cultural): {len(clusters[clusters['cluster'] == 1])} protocols")
    print(f"Cluster 2 (Institutional/Bureaucratic): {len(clusters[clusters['cluster'] == 2])} protocols")

    print("\nSample protocols from each cluster:")
    print("\nCluster 1 (Relational/Cultural):")
    for protocol in clusters[clusters['cluster'] == 1]['Descriptor'].head(10):
        print(f"  - {protocol}")

    print("\nCluster 2 (Institutional/Bureaucratic):")
    for protocol in clusters[clusters['cluster'] == 2]['Descriptor'].head(10):
        print(f"  - {protocol}")

    print("\n=== Visualization Complete! ===")


def main():
    """CLI entry point.

    Loads k-means cluster assignments plus the saved PCA / t-SNE (and, when
    present, UMAP) coordinates from <analysis-dir>/data, writes one
    cluster-colored scatter PNG per embedding into <analysis-dir>/plots,
    and prints a textual cluster summary.
    """
    parser = argparse.ArgumentParser(
        description='Visualize k-means clusters in PCA/t-SNE/UMAP space',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
  python3 scripts/visualize_clusters.py data/readings/synthetic_20251116/readings.csv
  python3 scripts/visualize_clusters.py data/readings/manual_20260101/readings.csv --analysis-dir data/readings/manual_20260101/analysis
"""
    )
    parser.add_argument('input_csv', help='Diagnostic readings CSV (e.g. data/readings/synthetic_20251116/readings.csv)')
    parser.add_argument('--analysis-dir', default=None,
                        help='Analysis directory (default: <dataset_dir>/analysis)')
    args = parser.parse_args()

    # Analysis artifacts live next to the dataset unless overridden.
    dataset_dir = Path(args.input_csv).parent
    results_dir = Path(args.analysis_dir) if args.analysis_dir else dataset_dir / 'analysis'
    plots_dir = results_dir / 'plots'
    data_dir = results_dir / 'data'

    # Cluster assignments are already 1-indexed in kmeans_clusters.csv.
    clusters = pd.read_csv(data_dir / 'kmeans_clusters.csv')

    # ========== PCA ==========
    print("Creating PCA plot with cluster colors...")
    pca_data = pd.read_csv(data_dir / 'pca_coordinates.csv').merge(clusters, on='Descriptor')
    # NOTE(review): variance percentages below are hard-coded from one run
    # (and PC2 > PC1 looks off, since PCA orders components by variance) —
    # confirm against the analysis output, or read them from a saved file.
    _plot_clustered(pca_data, 'PC1', 'PC2',
                    'PC1 (22.5% variance)', 'PC2 (22.7% variance)',
                    'K-Means Clusters in PCA Space\nTwo Distinct Protocol Families',
                    plots_dir / 'pca_2d_clustered.png')

    # ========== t-SNE ==========
    print("Creating t-SNE plot with cluster colors...")
    tsne_data = pd.read_csv(data_dir / 'tsne_coordinates.csv').merge(clusters, on='Descriptor')
    _plot_clustered(tsne_data, 'TSNE1', 'TSNE2',
                    't-SNE Dimension 1', 't-SNE Dimension 2',
                    'K-Means Clusters in t-SNE Space\nTwo Distinct Protocol Families',
                    plots_dir / 'tsne_2d_clustered.png')

    # ========== UMAP (optional — only if coordinates were generated) ==========
    umap_path = data_dir / 'umap_coordinates.csv'
    if umap_path.exists():
        print("Creating UMAP plot with cluster colors...")
        umap_data = pd.read_csv(umap_path).merge(clusters, on='Descriptor')
        _plot_clustered(umap_data, 'UMAP1', 'UMAP2',
                        'UMAP Dimension 1', 'UMAP Dimension 2',
                        'K-Means Clusters in UMAP Space\nTwo Distinct Protocol Families',
                        plots_dir / 'umap_2d_clustered.png')

    _print_summary(clusters)
||||
|
||||
|
||||
# Run the CLI only when executed as a script (not when imported).
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user