Files
protocol-bicorder/analysis/bicorder_batch.py
Nathan Schneider fa527bd1f1 Initial analysis
2025-11-21 19:34:33 -07:00

176 lines
5.8 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Batch process all protocols in a CSV using the Bicorder framework.
This script orchestrates the entire analysis workflow:
1. Creates output CSV with gradient columns
2. For each protocol row:
- Queries all 23 gradients (each in a new chat)
- Updates CSV with results
"""
import csv
import json
import sys
import argparse
import subprocess
from pathlib import Path
def count_csv_rows(csv_path):
    """Return the number of data rows (header excluded) in *csv_path*."""
    with open(csv_path, 'r', encoding='utf-8') as handle:
        # DictReader consumes the header line, so each yielded item is one data row.
        return sum(1 for _ in csv.DictReader(handle))
def run_bicorder_analyze(input_csv, output_csv, bicorder_path, analyst=None, standpoint=None):
    """Run bicorder_analyze.py to create the output CSV with gradient columns.

    Args:
        input_csv: Path to the source CSV of protocol data.
        output_csv: Path where the analysis CSV will be written.
        bicorder_path: Path to the bicorder.json config passed via -b.
        analyst: Optional analyst name forwarded via -a.
        standpoint: Optional analyst standpoint forwarded via -s.

    Returns:
        True if the subprocess exited successfully, False otherwise.
    """
    # sys.executable runs the child under the same interpreter as this script;
    # a hardcoded 'python3' fails on Windows and in environments where the
    # alias points at a different interpreter than the active one.
    cmd = [sys.executable, 'bicorder_analyze.py', input_csv, '-o', output_csv, '-b', bicorder_path]
    if analyst:
        cmd.extend(['-a', analyst])
    if standpoint:
        cmd.extend(['-s', standpoint])
    print(f"Creating analysis CSV: {output_csv}")
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        print(f"Error creating CSV: {result.stderr}", file=sys.stderr)
        return False
    print(result.stdout)
    return True
def query_gradients(output_csv, row_num, bicorder_path, model=None):
    """Query all gradients for one protocol row via bicorder_query.py.

    Args:
        output_csv: Analysis CSV to update in place.
        row_num: 1-indexed data row to process.
        bicorder_path: Path to the bicorder.json config passed via -b.
        model: Optional LLM model name forwarded via -m.

    Returns:
        True if the subprocess exited successfully, False otherwise.
    """
    # sys.executable keeps the child on the same interpreter as this script
    # (consistent with run_bicorder_analyze; 'python3' is not portable).
    cmd = [sys.executable, 'bicorder_query.py', output_csv, str(row_num),
           '-b', bicorder_path]
    if model:
        cmd.extend(['-m', model])
    print(f"Starting gradient queries...")
    # Don't capture output - let it print in real-time for progress visibility
    result = subprocess.run(cmd)
    if result.returncode != 0:
        print(f"Error querying gradients", file=sys.stderr)
        return False
    return True
def process_protocol_row(input_csv, output_csv, row_num, total_rows, bicorder_path, model=None):
    """Run the complete analysis workflow for a single protocol row.

    Prints a progress banner, then queries every gradient for the row
    (each gradient gets a new chat). Returns True on success, False if
    the gradient queries failed.
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Row {row_num}/{total_rows}")
    print(banner)
    succeeded = query_gradients(output_csv, row_num, bicorder_path, model)
    if not succeeded:
        print(f"[FAILED] Could not query gradients")
        return False
    print(f"✓ Row {row_num} complete")
    return True
def main():
    """Command-line entry point: parse args, build the CSV, process each row."""
    arg_parser = argparse.ArgumentParser(
        description='Batch process protocols through Bicorder analysis (each gradient uses a new chat)',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
# Process all protocols
python3 bicorder_batch.py protocols_edited.csv -o analysis_output.csv
# Process specific rows
python3 bicorder_batch.py protocols_edited.csv -o analysis_output.csv --start 1 --end 5
# With specific model
python3 bicorder_batch.py protocols_edited.csv -o analysis_output.csv -m mistral
# With metadata
python3 bicorder_batch.py protocols_edited.csv -o analysis_output.csv -a "Your Name" -s "Your standpoint"
"""
    )
    add = arg_parser.add_argument
    add('input_csv', help='Input CSV file with protocol data')
    add('-o', '--output', required=True, help='Output CSV file')
    add('-b', '--bicorder',
        default='../bicorder.json',
        help='Path to bicorder.json (default: ../bicorder.json)')
    add('-m', '--model', help='LLM model to use')
    add('-a', '--analyst', help='Analyst name')
    add('-s', '--standpoint', help='Analyst standpoint')
    add('--start', type=int, default=1,
        help='Start row number (1-indexed, default: 1)')
    add('--end', type=int,
        help='End row number (1-indexed, default: all rows)')
    add('--resume', action='store_true',
        help='Resume from existing output CSV (skip rows with values)')
    args = arg_parser.parse_args()

    # Fail fast on missing input file or bicorder config.
    if not Path(args.input_csv).exists():
        print(f"Error: Input file '{args.input_csv}' not found", file=sys.stderr)
        sys.exit(1)
    if not Path(args.bicorder).exists():
        print(f"Error: Bicorder config '{args.bicorder}' not found", file=sys.stderr)
        sys.exit(1)

    # Resolve the row range against the actual CSV size.
    total_rows = count_csv_rows(args.input_csv)
    end_row = args.end if args.end else total_rows
    if args.start > total_rows or end_row > total_rows:
        print(f"Error: Row range exceeds CSV size ({total_rows} rows)", file=sys.stderr)
        sys.exit(1)

    print(f"Bicorder Batch Analysis")
    print(f"Input: {args.input_csv} ({total_rows} protocols)")
    print(f"Output: {args.output}")
    print(f"Processing rows: {args.start} to {end_row}")
    if args.model:
        print(f"Model: {args.model}")
    print()

    # Step 1: create the output CSV, unless we can resume an existing one.
    can_resume = args.resume and Path(args.output).exists()
    if can_resume:
        print(f"Resuming from existing CSV: {args.output}")
    else:
        created = run_bicorder_analyze(args.input_csv, args.output, args.bicorder,
                                       args.analyst, args.standpoint)
        if not created:
            sys.exit(1)

    # Step 2: process each protocol row, tallying outcomes but never aborting
    # the whole batch over one failed row.
    success_count = 0
    fail_count = 0
    for row_num in range(args.start, end_row + 1):
        ok = process_protocol_row(args.input_csv, args.output, row_num, end_row,
                                  args.bicorder, args.model)
        if ok:
            success_count += 1
        else:
            fail_count += 1
            print(f"[WARNING] Row {row_num} failed, continuing...")

    # Summary
    divider = '=' * 60
    print(f"\n{divider}")
    print(f"BATCH COMPLETE")
    print(divider)
    print(f"Successful: {success_count}")
    print(f"Failed: {fail_count}")
    print(f"Output: {args.output}")


if __name__ == '__main__':
    main()