Reorganize directory, add manual dataset and sync tooling

- Move all scripts to scripts/, web assets to web/, analysis results
  into self-contained data/readings/<type>_<YYYYMMDD>/ directories
- Add data/readings/manual_20260320/ with 32 JSON readings from
  git.medlab.host/ntnsndr/protocol-bicorder-data
- Add scripts/json_to_csv.py to convert bicorder JSON files to CSV
- Add scripts/sync_readings.sh for one-command sync + re-analysis of
  any dataset backed by a .sync_source config file
- Add scripts/classify_readings.py to apply the LDA classifier to all
  readings and save per-reading cluster assignments
- Add --min-coverage flag to multivariate_analysis.py for sparse/shortform
  datasets; the same flag also applies in lda_visualization.py
- Fix lda_visualization.py NaN handling and 0-d array annotation bug
- Update README.md and WORKFLOW.md to document datasets, sync workflow,
  shortform handling, and new scripts

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Nathan Schneider
2026-03-20 17:35:13 -06:00
parent 0c794dddae
commit 897c30406b
545 changed files with 10715 additions and 718 deletions

View File

@@ -0,0 +1,102 @@
#!/usr/bin/env python3
"""
Apply the BicorderClassifier to all readings in a CSV and save results.
Uses the synthetic-trained LDA model by default. Missing dimensions are
filled with the neutral value (5), so shortform readings can still be
classified — though with lower confidence.
Usage:
python3 scripts/classify_readings.py data/readings/manual_20260320/readings.csv
python3 scripts/classify_readings.py data/readings/manual_20260320/readings.csv \\
--training data/readings/synthetic_20251116/readings.csv \\
--output data/readings/manual_20260320/analysis/classifications.csv
"""
import argparse
import csv
from pathlib import Path
import pandas as pd
from bicorder_classifier import BicorderClassifier
def main():
    """Classify every reading in a CSV with the BicorderClassifier and save results.

    Reads the input CSV, builds a per-reading ratings dict from the
    classifier's known dimension columns (NaN entries are omitted so the
    classifier can substitute its neutral fill value), runs the LDA
    classifier on each row, writes one classification row per reading to
    the output CSV, and prints a cluster summary plus warnings about
    low-confidence and shortform readings.
    """
    parser = argparse.ArgumentParser(
        description='Classify all readings in a CSV using the BicorderClassifier'
    )
    parser.add_argument('input_csv', help='Readings CSV to classify')
    parser.add_argument(
        '--training',
        default='data/readings/synthetic_20251116/readings.csv',
        help='Training CSV for classifier (default: synthetic_20251116)'
    )
    parser.add_argument(
        '--output', default=None,
        help='Output CSV path (default: <dataset>/analysis/classifications.csv)'
    )
    args = parser.parse_args()

    input_path = Path(args.input_csv)
    output_path = (
        Path(args.output) if args.output
        else input_path.parent / 'analysis' / 'classifications.csv'
    )
    output_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"Loading classifier (training: {args.training})...")
    classifier = BicorderClassifier(diagnostic_csv=args.training)

    df = pd.read_csv(input_path)
    if df.empty:
        # Fail fast with a clear message instead of silently writing an
        # empty classifications file.
        parser.error(f"No readings found in {input_path}")
    print(f"Classifying {len(df)} readings from {input_path}...")

    rows = []
    for _, record in df.iterrows():
        # Build ratings dict from dimension columns only; skip NaNs so the
        # classifier treats them as missing (filled with neutral 5).
        ratings = {
            col: float(record[col])
            for col in classifier.DIMENSIONS
            if col in record and pd.notna(record[col])
        }
        result = classifier.predict(ratings, return_details=True)
        rows.append({
            'Descriptor': record.get('Descriptor', ''),
            'analyst': record.get('analyst', ''),
            'standpoint': record.get('standpoint', ''),
            'shortform': record.get('shortform', ''),
            'cluster': result['cluster'],
            'cluster_name': result['cluster_name'],
            'confidence': round(result['confidence'], 3),
            'lda_score': round(result['lda_score'], 3),
            'distance_to_boundary': round(result['distance_to_boundary'], 3),
            'completeness': round(result['completeness'], 3),
            'dimensions_provided': result['dimensions_provided'],
            'key_dims_provided': result['key_dimensions_provided'],
            'recommended_form': result['recommended_form'],
        })

    out_df = pd.DataFrame(rows)
    out_df.to_csv(output_path, index=False)
    print(f"Classifications saved → {output_path}")

    # Summary
    counts = out_df['cluster_name'].value_counts()
    print("\nCluster summary:")
    for name, count in counts.items():
        pct = count / len(out_df) * 100
        print(f"  {name}: {count} ({pct:.0f}%)")

    low_conf = (out_df['confidence'] < 0.4).sum()
    if low_conf:
        print(f"\n  {low_conf} readings with low confidence (<0.4) — may be boundary cases")

    # Accept bool True as well as 'True'/'true' strings — shortform flags in
    # JSON-derived CSVs may be lowercase (backward compatible with 'True').
    shortform_mask = out_df['shortform'].astype(str).str.lower() == 'true'
    shortform_count = int(shortform_mask.sum())
    if shortform_count:
        print(f"\n  {shortform_count} shortform readings classified (missing dims filled with neutral 5)")


if __name__ == '__main__':
    main()