Reorganize directory, add manual dataset and sync tooling
- Move all scripts to scripts/, web assets to web/, and analysis results into self-contained data/readings/&lt;type&gt;_&lt;YYYYMMDD&gt;/ directories
- Add data/readings/manual_20260320/ with 32 JSON readings from git.medlab.host/ntnsndr/protocol-bicorder-data
- Add scripts/json_to_csv.py to convert bicorder JSON files to CSV
- Add scripts/sync_readings.sh for one-command sync + re-analysis of any dataset backed by a .sync_source config file
- Add scripts/classify_readings.py to apply the LDA classifier to all readings and save per-reading cluster assignments
- Add a --min-coverage flag to multivariate_analysis.py for sparse/shortform datasets; the flag also applies in lda_visualization.py
- Fix NaN handling and a 0-d array annotation bug in lda_visualization.py
- Update README.md and WORKFLOW.md to document the datasets, the sync workflow, shortform handling, and the new scripts

Co-Authored-By: Claude Sonnet 4.6 &lt;noreply@anthropic.com&gt;
This commit is contained in:
102
analysis/scripts/classify_readings.py
Normal file
102
analysis/scripts/classify_readings.py
Normal file
@@ -0,0 +1,102 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Apply the BicorderClassifier to all readings in a CSV and save results.
|
||||
|
||||
Uses the synthetic-trained LDA model by default. Missing dimensions are
|
||||
filled with the neutral value (5), so shortform readings can still be
|
||||
classified — though with lower confidence.
|
||||
|
||||
Usage:
|
||||
python3 scripts/classify_readings.py data/readings/manual_20260320/readings.csv
|
||||
python3 scripts/classify_readings.py data/readings/manual_20260320/readings.csv \\
|
||||
--training data/readings/synthetic_20251116/readings.csv \\
|
||||
--output data/readings/manual_20260320/analysis/classifications.csv
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from bicorder_classifier import BicorderClassifier
|
||||
|
||||
|
||||
def _parse_args():
    """Parse and return the command-line arguments for this script."""
    parser = argparse.ArgumentParser(
        description='Classify all readings in a CSV using the BicorderClassifier'
    )
    parser.add_argument('input_csv', help='Readings CSV to classify')
    parser.add_argument(
        '--training',
        default='data/readings/synthetic_20251116/readings.csv',
        help='Training CSV for classifier (default: synthetic_20251116)'
    )
    parser.add_argument(
        '--output', default=None,
        help='Output CSV path (default: <dataset>/analysis/classifications.csv)'
    )
    return parser.parse_args()


def _classify_row(classifier, record):
    """Classify a single reading row and return a flat result dict.

    Only the classifier's known dimension columns are used as ratings;
    missing/NaN dimensions are omitted so the classifier can fill them
    with its neutral default.
    """
    ratings = {
        col: float(record[col])
        for col in classifier.DIMENSIONS
        if col in record and pd.notna(record[col])
    }

    result = classifier.predict(ratings, return_details=True)

    return {
        'Descriptor': record.get('Descriptor', ''),
        'analyst': record.get('analyst', ''),
        'standpoint': record.get('standpoint', ''),
        'shortform': record.get('shortform', ''),
        'cluster': result['cluster'],
        'cluster_name': result['cluster_name'],
        'confidence': round(result['confidence'], 3),
        'lda_score': round(result['lda_score'], 3),
        'distance_to_boundary': round(result['distance_to_boundary'], 3),
        'completeness': round(result['completeness'], 3),
        'dimensions_provided': result['dimensions_provided'],
        'key_dims_provided': result['key_dimensions_provided'],
        'recommended_form': result['recommended_form'],
    }


def _print_summary(out_df):
    """Print cluster counts plus low-confidence and shortform warnings."""
    counts = out_df['cluster_name'].value_counts()
    print("\nCluster summary:")
    for name, count in counts.items():
        pct = count / len(out_df) * 100
        print(f" {name}: {count} ({pct:.0f}%)")

    low_conf = (out_df['confidence'] < 0.4).sum()
    if low_conf:
        print(f"\n {low_conf} readings with low confidence (<0.4) — may be boundary cases")

    # Case-insensitive match: JSON-derived CSVs may carry 'true' rather
    # than Python's 'True' for boolean columns.
    shortform_count = (out_df['shortform'].astype(str).str.lower() == 'true').sum()
    if shortform_count:
        print(f"\n {shortform_count} shortform readings classified (missing dims filled with neutral 5)")


def main():
    """Classify every reading in a CSV and write per-reading results.

    Loads a BicorderClassifier trained on the --training CSV, classifies
    each row of the input CSV, saves a classifications CSV (default:
    <dataset>/analysis/classifications.csv), and prints a summary.
    """
    args = _parse_args()

    input_path = Path(args.input_csv)
    output_path = (
        Path(args.output) if args.output
        else input_path.parent / 'analysis' / 'classifications.csv'
    )
    output_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"Loading classifier (training: {args.training})...")
    classifier = BicorderClassifier(diagnostic_csv=args.training)

    df = pd.read_csv(input_path)
    print(f"Classifying {len(df)} readings from {input_path}...")

    out_df = pd.DataFrame([_classify_row(classifier, record)
                           for _, record in df.iterrows()])
    out_df.to_csv(output_path, index=False)
    print(f"Classifications saved → {output_path}")

    # An empty input yields a column-less DataFrame; summarizing it would
    # raise KeyError, so bail out early.
    if out_df.empty:
        print("\nNo readings to summarize.")
        return

    _print_summary(out_df)
|
||||
|
||||
|
||||
# Script entry point: run the classifier over the CSV named on the CLI.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user