Files
protocol-bicorder/analysis/scripts/classify_readings.py
Nathan Schneider 60e83783ec Flatten data/readings/ → data/
Remove the intermediate readings/ subdirectory level — dataset naming
(synthetic_YYYYMMDD, manual_YYYYMMDD) already encodes what the data is.
Update all path references across scripts and docs accordingly.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-20 17:46:23 -06:00

103 lines
3.5 KiB
Python

#!/usr/bin/env python3
"""
Apply the BicorderClassifier to all readings in a CSV and save results.
Uses the synthetic-trained LDA model by default. Missing dimensions are
filled with the neutral value (5), so shortform readings can still be
classified — though with lower confidence.
Usage:
python3 scripts/classify_readings.py data/manual_20260320/readings.csv
python3 scripts/classify_readings.py data/manual_20260320/readings.csv \\
--training data/synthetic_20251116/readings.csv \\
--output data/manual_20260320/analysis/classifications.csv
"""
import argparse
import csv
from pathlib import Path
import pandas as pd
from bicorder_classifier import BicorderClassifier
def _parse_args():
    """Parse and return command-line arguments for the classification run."""
    parser = argparse.ArgumentParser(
        description='Classify all readings in a CSV using the BicorderClassifier'
    )
    parser.add_argument('input_csv', help='Readings CSV to classify')
    parser.add_argument(
        '--training',
        default='data/synthetic_20251116/readings.csv',
        help='Training CSV for classifier (default: synthetic_20251116)'
    )
    parser.add_argument(
        '--output', default=None,
        help='Output CSV path (default: <dataset>/analysis/classifications.csv)'
    )
    return parser.parse_args()


def _classify_record(classifier, record):
    """Classify one reading (a pandas Series) and return its output row dict.

    Only non-NaN dimension columns are passed to the classifier; missing
    dimensions are omitted so the classifier can fill them with the
    neutral value (per the module docstring).
    """
    # Build ratings dict from dimension columns only.
    ratings = {
        col: float(record[col])
        for col in classifier.DIMENSIONS
        if col in record and pd.notna(record[col])
    }
    result = classifier.predict(ratings, return_details=True)
    return {
        'Descriptor': record.get('Descriptor', ''),
        'analyst': record.get('analyst', ''),
        'standpoint': record.get('standpoint', ''),
        'shortform': record.get('shortform', ''),
        'cluster': result['cluster'],
        'cluster_name': result['cluster_name'],
        'confidence': round(result['confidence'], 3),
        'lda_score': round(result['lda_score'], 3),
        'distance_to_boundary': round(result['distance_to_boundary'], 3),
        'completeness': round(result['completeness'], 3),
        'dimensions_provided': result['dimensions_provided'],
        'key_dims_provided': result['key_dimensions_provided'],
        'recommended_form': result['recommended_form'],
    }


def _print_summary(out_df):
    """Print cluster distribution plus low-confidence and shortform counts."""
    counts = out_df['cluster_name'].value_counts()
    print("\nCluster summary:")
    for name, count in counts.items():
        pct = count / len(out_df) * 100
        print(f" {name}: {count} ({pct:.0f}%)")
    low_conf = (out_df['confidence'] < 0.4).sum()
    if low_conf:
        print(f"\n {low_conf} readings with low confidence (<0.4) — may be boundary cases")
    # Input 'shortform' values may be bools or strings; stringify to compare.
    shortform_count = out_df[out_df['shortform'].astype(str) == 'True'].shape[0]
    if shortform_count:
        print(f"\n {shortform_count} shortform readings classified (missing dims filled with neutral 5)")


def main():
    """Classify every reading in the input CSV and save the results.

    Loads the BicorderClassifier (synthetic-trained by default), classifies
    each row of the input CSV, writes a classifications CSV next to the
    dataset (or to --output), and prints a summary.
    """
    args = _parse_args()
    input_path = Path(args.input_csv)
    output_path = (
        Path(args.output) if args.output
        else input_path.parent / 'analysis' / 'classifications.csv'
    )
    output_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"Loading classifier (training: {args.training})...")
    classifier = BicorderClassifier(diagnostic_csv=args.training)

    df = pd.read_csv(input_path)
    print(f"Classifying {len(df)} readings from {input_path}...")

    rows = [_classify_record(classifier, record) for _, record in df.iterrows()]

    out_df = pd.DataFrame(rows)
    out_df.to_csv(output_path, index=False)
    print(f"Classifications saved → {output_path}")

    # Guard: with zero input rows, out_df has no columns and the summary's
    # out_df['cluster_name'] lookup would raise KeyError.
    if out_df.empty:
        print("No readings to classify — empty input CSV.")
        return
    _print_summary(out_df)
# Run only when executed as a script, not when imported.
if __name__ == '__main__':
    main()