Reorganize directory, add manual dataset and sync tooling

- Move all scripts to scripts/, web assets to web/, analysis results
  into self-contained data/readings/<type>_<YYYYMMDD>/ directories
- Add data/readings/manual_20260320/ with 32 JSON readings from
  git.medlab.host/ntnsndr/protocol-bicorder-data
- Add scripts/json_to_csv.py to convert bicorder JSON files to CSV
- Add scripts/sync_readings.sh for one-command sync + re-analysis of
  any dataset backed by a .sync_source config file
- Add scripts/classify_readings.py to apply the LDA classifier to all
  readings and save per-reading cluster assignments
- Add --min-coverage flag to multivariate_analysis.py for sparse/shortform
  datasets; the same flag also applies in lda_visualization.py
- Fix lda_visualization.py NaN handling and 0-d array annotation bug
- Update README.md and WORKFLOW.md to document datasets, sync workflow,
  shortform handling, and new scripts

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Nathan Schneider
2026-03-20 17:35:13 -06:00
parent 0c794dddae
commit 897c30406b
545 changed files with 10715 additions and 718 deletions

View File

@@ -0,0 +1,102 @@
#!/usr/bin/env python3
"""
Apply the BicorderClassifier to all readings in a CSV and save results.
Uses the synthetic-trained LDA model by default. Missing dimensions are
filled with the neutral value (5), so shortform readings can still be
classified — though with lower confidence.
Usage:
python3 scripts/classify_readings.py data/readings/manual_20260320/readings.csv
python3 scripts/classify_readings.py data/readings/manual_20260320/readings.csv \\
--training data/readings/synthetic_20251116/readings.csv \\
--output data/readings/manual_20260320/analysis/classifications.csv
"""
import argparse
import csv
from pathlib import Path
import pandas as pd
from bicorder_classifier import BicorderClassifier
def main():
    """Classify every reading in a CSV with the BicorderClassifier and save results.

    Reads the input CSV, builds a per-reading ratings dict from the
    classifier's known dimension columns (NaN entries are omitted so the
    classifier can substitute its neutral fill value), runs the LDA
    classifier on each row, writes one classification row per reading to
    the output CSV, and prints a cluster summary plus warnings about
    low-confidence and shortform readings.
    """
    parser = argparse.ArgumentParser(
        description='Classify all readings in a CSV using the BicorderClassifier'
    )
    parser.add_argument('input_csv', help='Readings CSV to classify')
    parser.add_argument(
        '--training',
        default='data/readings/synthetic_20251116/readings.csv',
        help='Training CSV for classifier (default: synthetic_20251116)'
    )
    parser.add_argument(
        '--output', default=None,
        help='Output CSV path (default: <dataset>/analysis/classifications.csv)'
    )
    args = parser.parse_args()

    input_path = Path(args.input_csv)
    output_path = (
        Path(args.output) if args.output
        else input_path.parent / 'analysis' / 'classifications.csv'
    )
    output_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"Loading classifier (training: {args.training})...")
    classifier = BicorderClassifier(diagnostic_csv=args.training)

    df = pd.read_csv(input_path)
    if df.empty:
        # Fail fast with a clear message instead of silently writing an
        # empty classifications file.
        parser.error(f"No readings found in {input_path}")
    print(f"Classifying {len(df)} readings from {input_path}...")

    rows = []
    for _, record in df.iterrows():
        # Build ratings dict from dimension columns only; skip NaNs so the
        # classifier treats them as missing (filled with neutral 5).
        ratings = {
            col: float(record[col])
            for col in classifier.DIMENSIONS
            if col in record and pd.notna(record[col])
        }
        result = classifier.predict(ratings, return_details=True)
        rows.append({
            'Descriptor': record.get('Descriptor', ''),
            'analyst': record.get('analyst', ''),
            'standpoint': record.get('standpoint', ''),
            'shortform': record.get('shortform', ''),
            'cluster': result['cluster'],
            'cluster_name': result['cluster_name'],
            'confidence': round(result['confidence'], 3),
            'lda_score': round(result['lda_score'], 3),
            'distance_to_boundary': round(result['distance_to_boundary'], 3),
            'completeness': round(result['completeness'], 3),
            'dimensions_provided': result['dimensions_provided'],
            'key_dims_provided': result['key_dimensions_provided'],
            'recommended_form': result['recommended_form'],
        })

    out_df = pd.DataFrame(rows)
    out_df.to_csv(output_path, index=False)
    print(f"Classifications saved → {output_path}")

    # Summary
    counts = out_df['cluster_name'].value_counts()
    print("\nCluster summary:")
    for name, count in counts.items():
        pct = count / len(out_df) * 100
        print(f"  {name}: {count} ({pct:.0f}%)")

    low_conf = (out_df['confidence'] < 0.4).sum()
    if low_conf:
        print(f"\n  {low_conf} readings with low confidence (<0.4) — may be boundary cases")

    # Accept bool True as well as 'True'/'true' strings — shortform flags in
    # JSON-derived CSVs may be lowercase (backward compatible with 'True').
    shortform_mask = out_df['shortform'].astype(str).str.lower() == 'true'
    shortform_count = int(shortform_mask.sum())
    if shortform_count:
        print(f"\n  {shortform_count} shortform readings classified (missing dims filled with neutral 5)")


if __name__ == '__main__':
    main()