- Move all scripts to scripts/, web assets to web/, analysis results into self-contained data/readings/<type>_<YYYYMMDD>/ directories - Add data/readings/manual_20260320/ with 32 JSON readings from git.medlab.host/ntnsndr/protocol-bicorder-data - Add scripts/json_to_csv.py to convert bicorder JSON files to CSV - Add scripts/sync_readings.sh for one-command sync + re-analysis of any dataset backed by a .sync_source config file - Add scripts/classify_readings.py to apply the LDA classifier to all readings and save per-reading cluster assignments - Add --min-coverage flag to multivariate_analysis.py for sparse/shortform datasets; also applies in lda_visualization.py - Fix lda_visualization.py NaN handling and 0-d array annotation bug - Update README.md and WORKFLOW.md to document datasets, sync workflow, shortform handling, and new scripts Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
103 lines
3.6 KiB
Python
103 lines
3.6 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Apply the BicorderClassifier to all readings in a CSV and save results.
|
|
|
|
Uses the synthetic-trained LDA model by default. Missing dimensions are
|
|
filled with the neutral value (5), so shortform readings can still be
|
|
classified — though with lower confidence.
|
|
|
|
Usage:
|
|
python3 scripts/classify_readings.py data/readings/manual_20260320/readings.csv
|
|
python3 scripts/classify_readings.py data/readings/manual_20260320/readings.csv \\
|
|
--training data/readings/synthetic_20251116/readings.csv \\
|
|
--output data/readings/manual_20260320/analysis/classifications.csv
|
|
"""
|
|
|
|
import argparse
|
|
import csv
|
|
from pathlib import Path
|
|
|
|
import pandas as pd
|
|
|
|
from bicorder_classifier import BicorderClassifier
|
|
|
|
|
|
def main():
    """Classify every reading in a CSV and save per-reading cluster assignments.

    Loads a BicorderClassifier trained on the --training CSV, applies it to
    each row of the input CSV, writes a classifications CSV next to the
    dataset (or to --output), and prints a short console summary: cluster
    counts, low-confidence count, and shortform count.
    """
    parser = argparse.ArgumentParser(
        description='Classify all readings in a CSV using the BicorderClassifier'
    )
    parser.add_argument('input_csv', help='Readings CSV to classify')
    parser.add_argument(
        '--training',
        default='data/readings/synthetic_20251116/readings.csv',
        help='Training CSV for classifier (default: synthetic_20251116)'
    )
    parser.add_argument(
        '--output', default=None,
        help='Output CSV path (default: <dataset>/analysis/classifications.csv)'
    )
    args = parser.parse_args()

    input_path = Path(args.input_csv)
    output_path = (
        Path(args.output) if args.output
        else input_path.parent / 'analysis' / 'classifications.csv'
    )
    output_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"Loading classifier (training: {args.training})...")
    classifier = BicorderClassifier(diagnostic_csv=args.training)

    df = pd.read_csv(input_path)
    print(f"Classifying {len(df)} readings from {input_path}...")

    rows = []
    for _, record in df.iterrows():
        # Build ratings dict from dimension columns only; NaNs are skipped so
        # the classifier can substitute its neutral value for missing dims.
        ratings = {
            col: float(record[col])
            for col in classifier.DIMENSIONS
            if col in record and pd.notna(record[col])
        }

        result = classifier.predict(ratings, return_details=True)

        rows.append({
            'Descriptor': record.get('Descriptor', ''),
            'analyst': record.get('analyst', ''),
            'standpoint': record.get('standpoint', ''),
            'shortform': record.get('shortform', ''),
            'cluster': result['cluster'],
            'cluster_name': result['cluster_name'],
            'confidence': round(result['confidence'], 3),
            'lda_score': round(result['lda_score'], 3),
            'distance_to_boundary': round(result['distance_to_boundary'], 3),
            'completeness': round(result['completeness'], 3),
            'dimensions_provided': result['dimensions_provided'],
            'key_dims_provided': result['key_dimensions_provided'],
            'recommended_form': result['recommended_form'],
        })

    out_df = pd.DataFrame(rows)
    out_df.to_csv(output_path, index=False)
    print(f"Classifications saved → {output_path}")

    # Guard: an empty input CSV yields a DataFrame with no columns, and the
    # summary below would raise KeyError on 'cluster_name' / 'confidence'.
    if out_df.empty:
        print("\nNo readings to summarize (input CSV was empty).")
        return

    # Summary
    counts = out_df['cluster_name'].value_counts()
    print("\nCluster summary:")
    for name, count in counts.items():
        pct = count / len(out_df) * 100
        print(f" {name}: {count} ({pct:.0f}%)")

    low_conf = (out_df['confidence'] < 0.4).sum()
    if low_conf:
        print(f"\n {low_conf} readings with low confidence (<0.4) — may be boundary cases")

    # The shortform column may hold booleans or strings depending on the
    # source CSV; stringifying normalizes both to 'True' for the comparison.
    shortform_count = out_df[out_df['shortform'].astype(str) == 'True'].shape[0]
    if shortform_count:
        print(f"\n {shortform_count} shortform readings classified (missing dims filled with neutral 5)")
|
|
|
|
|
|
# Script entry point: run only when executed directly, not when imported.
if __name__ == '__main__':
    main()
|