# protocol-bicorder/analysis/bicorder_analyze.py

#!/usr/bin/env python3
"""
Protocol Bicorder Analysis Script

Processes a two-column CSV file (protocol descriptor and description) and adds
one empty column for each diagnostic gradient defined in bicorder.json. The
gradient values are meant to be filled in afterwards by LLM commands.
"""

import csv
import json
import sys
import argparse
from pathlib import Path


def load_bicorder_config(bicorder_path):
    """Load and parse the bicorder.json configuration file.

    Args:
        bicorder_path: Path to the JSON configuration file.

    Returns:
        The decoded JSON document (whatever ``json.load`` produces).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, so non-ASCII text in the config reads the same everywhere.
    with open(bicorder_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def extract_gradients(bicorder_data):
    """Flatten the gradients of every diagnostic set into one list.

    Each returned dict carries the gradient's four term fields, the name of
    the set it came from, and a 'column_name' built from the set name and the
    two terms.
    """
    # Fields copied verbatim from each gradient, in output order.
    copied_fields = (
        'term_left',
        'term_left_description',
        'term_right',
        'term_right_description',
    )

    collected = []
    for group in bicorder_data['diagnostic']:
        prefix = group['set_name']
        for entry in group['gradients']:
            left = entry['term_left']
            right = entry['term_right']
            # The column name combines set and both terms so it is unique
            # per gradient within the configuration.
            record = {
                'column_name': f"{prefix}_{left}_vs_{right}",
                'set_name': prefix,
            }
            for field in copied_fields:
                record[field] = entry[field]
            collected.append(record)

    return collected


def process_csv(input_csv, output_csv, bicorder_path, analyst=None, standpoint=None):
    """
    Process the input CSV and add gradient columns.

    Every named input column is copied through unchanged, one empty column is
    appended per gradient found in the bicorder configuration, and optional
    'analyst' / 'standpoint' columns carry the same value on every row. A
    progress line is printed for each row, followed by a summary.

    Args:
        input_csv: Path to input CSV file
        output_csv: Path to output CSV file
        bicorder_path: Path to bicorder.json file
        analyst: Optional analyst name
        standpoint: Optional standpoint description
    """
    # Load bicorder configuration
    bicorder_data = load_bicorder_config(bicorder_path)
    gradients = extract_gradients(bicorder_data)
    gradient_columns = [g['column_name'] for g in gradients]

    # newline='' on both files lets the csv module do its own newline
    # handling, so quoted fields with embedded line breaks survive intact.
    with open(input_csv, 'r', newline='', encoding='utf-8') as infile, \
         open(output_csv, 'w', newline='', encoding='utf-8') as outfile:

        reader = csv.DictReader(infile)

        # fieldnames is None when the input file is completely empty; treat
        # that as "no original columns" instead of crashing. Unnamed or
        # whitespace-only header cells are dropped.
        original_fields = [f for f in (reader.fieldnames or []) if f and f.strip()]

        # Original columns first, then gradient columns, then metadata
        output_fields = original_fields + gradient_columns
        if analyst is not None:
            output_fields.append('analyst')
        if standpoint is not None:
            output_fields.append('standpoint')

        writer = csv.DictWriter(outfile, fieldnames=output_fields)
        writer.writeheader()

        # Process each protocol row
        row_count = 0
        for protocol_row in reader:
            # Keep only named columns; DictReader stores surplus cells of an
            # over-long row under the key None, which is filtered out here.
            output_row = {k: v for k, v in protocol_row.items() if k and k.strip()}

            # Initialize all gradient columns as empty (to be filled by LLM)
            for column_name in gradient_columns:
                output_row[column_name] = ''

            # Add metadata if provided
            if analyst is not None:
                output_row['analyst'] = analyst
            if standpoint is not None:
                output_row['standpoint'] = standpoint

            writer.writerow(output_row)
            row_count += 1

            # A short row yields None for its missing cells, so the .get()
            # default alone does not protect .strip(); fall back to ''.
            descriptor = (protocol_row.get('Descriptor') or '').strip()
            print(f"Processed protocol {row_count}: {descriptor}")

    print(f"\nOutput written to: {output_csv}")
    print(f"Total protocols: {row_count}")
    print(f"Gradient columns added: {len(gradients)}")
    print("\nGradient columns:")
    for i, column_name in enumerate(gradient_columns, 1):
        print(f"  {i}. {column_name}")


def main():
    """Command-line entry point.

    Parses the arguments, exits with status 1 (after printing an error to
    stderr) if the input CSV or the bicorder config path does not exist, and
    otherwise hands everything to process_csv.
    """
    cli = argparse.ArgumentParser(
        description='Process protocol CSV and add bicorder diagnostic columns',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
  python3 bicorder_analyze.py protocols_edited.csv -o output.csv
  python3 bicorder_analyze.py protocols_raw.csv -o output.csv -a "Jane Doe" -s "Researcher perspective"

The script will preserve all original columns and add one column per diagnostic gradient.
Each gradient column will be empty, ready to be filled by LLM commands.
        """
    )

    cli.add_argument('input_csv', help='Input CSV file with protocol data')
    cli.add_argument('-o', '--output', required=True, help='Output CSV file')
    cli.add_argument(
        '-b', '--bicorder',
        default='../bicorder.json',
        help='Path to bicorder.json (default: ../bicorder.json)',
    )
    cli.add_argument('-a', '--analyst', help='Analyst name (adds analyst column)')
    cli.add_argument('-s', '--standpoint', help='Analyst standpoint (adds standpoint column)')

    options = cli.parse_args()

    # Paths that must exist before any work starts, checked in this order,
    # each paired with the error reported when it is missing.
    required_paths = (
        (options.input_csv, f"Error: Input file '{options.input_csv}' not found"),
        (options.bicorder, f"Error: Bicorder config '{options.bicorder}' not found"),
    )
    for candidate, complaint in required_paths:
        if not Path(candidate).exists():
            print(complaint, file=sys.stderr)
            sys.exit(1)

    # Process the CSV
    process_csv(
        options.input_csv,
        options.output,
        options.bicorder,
        options.analyst,
        options.standpoint,
    )


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()