Files
protocol-bicorder/analysis/scripts/classify_readings.py
Nathan Schneider 60e83783ec Flatten data/readings/ → data/
Remove the intermediate readings/ subdirectory level — dataset naming
(synthetic_YYYYMMDD, manual_YYYYMMDD) already encodes what the data is.
Update all path references across scripts and docs accordingly.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-20 17:46:23 -06:00

103 lines
3.5 KiB
Python

#!/usr/bin/env python3
"""
Apply the BicorderClassifier to all readings in a CSV and save results.
Uses the synthetic-trained LDA model by default. Missing dimensions are
filled with the neutral value (5), so shortform readings can still be
classified — though with lower confidence.
Usage:
python3 scripts/classify_readings.py data/manual_20260320/readings.csv
python3 scripts/classify_readings.py data/manual_20260320/readings.csv \\
--training data/synthetic_20251116/readings.csv \\
--output data/manual_20260320/analysis/classifications.csv
"""
import argparse
import csv
from pathlib import Path
import pandas as pd
from bicorder_classifier import BicorderClassifier
def _parse_args():
    """Parse and return command-line arguments for the classification run."""
    parser = argparse.ArgumentParser(
        description='Classify all readings in a CSV using the BicorderClassifier'
    )
    parser.add_argument('input_csv', help='Readings CSV to classify')
    parser.add_argument(
        '--training',
        default='data/synthetic_20251116/readings.csv',
        help='Training CSV for classifier (default: synthetic_20251116)'
    )
    parser.add_argument(
        '--output', default=None,
        help='Output CSV path (default: <dataset>/analysis/classifications.csv)'
    )
    return parser.parse_args()


def _classify_record(classifier, record):
    """Classify one reading (a pandas Series) and return its output row dict.

    Only non-NaN dimension columns are passed to the classifier; missing
    dimensions are omitted so the classifier can fill them with the
    neutral value (per the module docstring).
    """
    # Build ratings dict from dimension columns only.
    ratings = {
        col: float(record[col])
        for col in classifier.DIMENSIONS
        if col in record and pd.notna(record[col])
    }
    result = classifier.predict(ratings, return_details=True)
    return {
        'Descriptor': record.get('Descriptor', ''),
        'analyst': record.get('analyst', ''),
        'standpoint': record.get('standpoint', ''),
        'shortform': record.get('shortform', ''),
        'cluster': result['cluster'],
        'cluster_name': result['cluster_name'],
        'confidence': round(result['confidence'], 3),
        'lda_score': round(result['lda_score'], 3),
        'distance_to_boundary': round(result['distance_to_boundary'], 3),
        'completeness': round(result['completeness'], 3),
        'dimensions_provided': result['dimensions_provided'],
        'key_dims_provided': result['key_dimensions_provided'],
        'recommended_form': result['recommended_form'],
    }


def _print_summary(out_df):
    """Print cluster distribution plus low-confidence and shortform counts."""
    counts = out_df['cluster_name'].value_counts()
    print("\nCluster summary:")
    for name, count in counts.items():
        pct = count / len(out_df) * 100
        print(f" {name}: {count} ({pct:.0f}%)")
    low_conf = (out_df['confidence'] < 0.4).sum()
    if low_conf:
        print(f"\n {low_conf} readings with low confidence (<0.4) — may be boundary cases")
    # Input 'shortform' values may be bools or strings; stringify to compare.
    shortform_count = out_df[out_df['shortform'].astype(str) == 'True'].shape[0]
    if shortform_count:
        print(f"\n {shortform_count} shortform readings classified (missing dims filled with neutral 5)")


def main():
    """Classify every reading in the input CSV and save the results.

    Loads the BicorderClassifier (synthetic-trained by default), classifies
    each row of the input CSV, writes a classifications CSV next to the
    dataset (or to --output), and prints a summary.
    """
    args = _parse_args()
    input_path = Path(args.input_csv)
    output_path = (
        Path(args.output) if args.output
        else input_path.parent / 'analysis' / 'classifications.csv'
    )
    output_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"Loading classifier (training: {args.training})...")
    classifier = BicorderClassifier(diagnostic_csv=args.training)

    df = pd.read_csv(input_path)
    print(f"Classifying {len(df)} readings from {input_path}...")

    rows = [_classify_record(classifier, record) for _, record in df.iterrows()]

    out_df = pd.DataFrame(rows)
    out_df.to_csv(output_path, index=False)
    print(f"Classifications saved → {output_path}")

    # Guard: with zero input rows, out_df has no columns and the summary's
    # out_df['cluster_name'] lookup would raise KeyError.
    if out_df.empty:
        print("No readings to classify — empty input CSV.")
        return
    _print_summary(out_df)
# Run only when executed as a script, not when imported.
if __name__ == '__main__':
    main()