Remove the intermediate readings/ subdirectory level — dataset naming (synthetic_YYYYMMDD, manual_YYYYMMDD) already encodes what the data is. Update all path references across scripts and docs accordingly. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
103 lines · 3.5 KiB · Python
#!/usr/bin/env python3
"""
Apply the BicorderClassifier to all readings in a CSV and save results.

Uses the synthetic-trained LDA model by default. Missing dimensions are
filled with the neutral value (5), so shortform readings can still be
classified — though with lower confidence.

Usage:
    python3 scripts/classify_readings.py data/manual_20260320/readings.csv
    python3 scripts/classify_readings.py data/manual_20260320/readings.csv \\
        --training data/synthetic_20251116/readings.csv \\
        --output data/manual_20260320/analysis/classifications.csv
"""
|
|
|
|
import argparse
|
|
import csv
|
|
from pathlib import Path
|
|
|
|
import pandas as pd
|
|
|
|
from bicorder_classifier import BicorderClassifier
|
|
|
|
|
|
def main():
    """CLI entry point: classify every reading in a CSV and save results.

    Parses arguments, loads a BicorderClassifier trained on the --training
    CSV, classifies each row of the input CSV, writes one result row per
    reading to the output CSV, and prints a cluster/confidence summary.
    """
    parser = argparse.ArgumentParser(
        description='Classify all readings in a CSV using the BicorderClassifier'
    )
    parser.add_argument('input_csv', help='Readings CSV to classify')
    parser.add_argument(
        '--training',
        default='data/synthetic_20251116/readings.csv',
        help='Training CSV for classifier (default: synthetic_20251116)'
    )
    parser.add_argument(
        '--output', default=None,
        help='Output CSV path (default: <dataset>/analysis/classifications.csv)'
    )
    args = parser.parse_args()

    input_path = Path(args.input_csv)
    # Default output lives next to the input, under an analysis/ subdir.
    output_path = (
        Path(args.output) if args.output
        else input_path.parent / 'analysis' / 'classifications.csv'
    )
    output_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"Loading classifier (training: {args.training})...")
    classifier = BicorderClassifier(diagnostic_csv=args.training)

    df = pd.read_csv(input_path)
    print(f"Classifying {len(df)} readings from {input_path}...")

    rows = [_classify_record(classifier, record) for _, record in df.iterrows()]

    out_df = pd.DataFrame(rows)
    out_df.to_csv(output_path, index=False)
    print(f"Classifications saved → {output_path}")

    if out_df.empty:
        # Guard: an empty input CSV previously raised KeyError on
        # out_df['cluster_name'] below.
        print("\nNo readings classified.")
        return

    _print_summary(out_df)


def _classify_record(classifier, record):
    """Classify one reading (a DataFrame row) and return an output-row dict."""
    # Build ratings dict from dimension columns only, skipping NaNs so the
    # classifier can fill missing dimensions with its neutral value.
    ratings = {
        col: float(record[col])
        for col in classifier.DIMENSIONS
        if col in record and pd.notna(record[col])
    }

    result = classifier.predict(ratings, return_details=True)

    return {
        'Descriptor': record.get('Descriptor', ''),
        'analyst': record.get('analyst', ''),
        'standpoint': record.get('standpoint', ''),
        'shortform': record.get('shortform', ''),
        'cluster': result['cluster'],
        'cluster_name': result['cluster_name'],
        'confidence': round(result['confidence'], 3),
        'lda_score': round(result['lda_score'], 3),
        'distance_to_boundary': round(result['distance_to_boundary'], 3),
        'completeness': round(result['completeness'], 3),
        'dimensions_provided': result['dimensions_provided'],
        'key_dims_provided': result['key_dimensions_provided'],
        'recommended_form': result['recommended_form'],
    }


def _print_summary(out_df):
    """Print cluster counts, low-confidence count, and shortform count."""
    counts = out_df['cluster_name'].value_counts()
    print("\nCluster summary:")
    for name, count in counts.items():
        pct = count / len(out_df) * 100
        print(f" {name}: {count} ({pct:.0f}%)")

    low_conf = (out_df['confidence'] < 0.4).sum()
    if low_conf:
        print(f"\n {low_conf} readings with low confidence (<0.4) — may be boundary cases")

    # NOTE(review): this matches only rows whose shortform value renders
    # exactly as 'True' (e.g. Python bools); 'true'/'1' would be missed —
    # confirm the upstream CSV format before tightening.
    shortform_count = out_df[out_df['shortform'].astype(str) == 'True'].shape[0]
    if shortform_count:
        print(f"\n {shortform_count} shortform readings classified (missing dims filled with neutral 5)")
|
|
|
# Script entry point — run only when executed directly, not on import.
if __name__ == '__main__':
    main()
|