Set up analysis scripts
This commit is contained in:
155
analysis/bicorder_analyze.py
Normal file
155
analysis/bicorder_analyze.py
Normal file
@@ -0,0 +1,155 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Protocol Bicorder Analysis Script
|
||||
|
||||
Processes a two-column CSV file (protocol descriptor and description) and adds
|
||||
columns for each diagnostic gradient from bicorder.json. Values to be filled
|
||||
by LLM commands.
|
||||
"""
|
||||
|
||||
import csv
|
||||
import json
|
||||
import sys
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def load_bicorder_config(bicorder_path):
    """Load and parse the bicorder.json configuration file.

    Args:
        bicorder_path: Path to the bicorder.json file.

    Returns:
        The parsed JSON document (expected to be a dict with a
        'diagnostic' key, per extract_gradients).
    """
    # Explicit UTF-8 for consistency with the CSV handling in process_csv;
    # the platform default encoding can mis-read UTF-8 configs on Windows.
    with open(bicorder_path, 'r', encoding='utf-8') as f:
        return json.load(f)
|
||||
|
||||
|
||||
def extract_gradients(bicorder_data):
    """Flatten every gradient from every diagnostic set into a list of dicts.

    Each record carries a unique 'column_name' of the form
    "<set_name>_<term_left>_vs_<term_right>" plus the gradient's terms
    and their descriptions, ready to be used as CSV column metadata.
    """
    def as_record(set_name, grad):
        # One flat record per gradient; column name doubles as a unique key.
        left, right = grad['term_left'], grad['term_right']
        return {
            'column_name': f"{set_name}_{left}_vs_{right}",
            'set_name': set_name,
            'term_left': left,
            'term_left_description': grad['term_left_description'],
            'term_right': right,
            'term_right_description': grad['term_right_description'],
        }

    return [
        as_record(dset['set_name'], grad)
        for dset in bicorder_data['diagnostic']
        for grad in dset['gradients']
    ]
|
||||
|
||||
|
||||
def process_csv(input_csv, output_csv, bicorder_path, analyst=None, standpoint=None):
    """
    Process the input CSV and add gradient columns.

    Reads protocol rows from input_csv, appends one empty column per
    diagnostic gradient defined in bicorder.json (values to be filled by
    LLM commands later), optionally appends analyst/standpoint metadata
    columns, and writes the combined rows to output_csv.

    Args:
        input_csv: Path to input CSV file
        output_csv: Path to output CSV file
        bicorder_path: Path to bicorder.json file
        analyst: Optional analyst name (adds an 'analyst' column)
        standpoint: Optional standpoint description (adds a 'standpoint' column)
    """
    # Load bicorder configuration and flatten it into gradient records.
    bicorder_data = load_bicorder_config(bicorder_path)
    gradients = extract_gradients(bicorder_data)

    with open(input_csv, 'r', encoding='utf-8') as infile, \
         open(output_csv, 'w', newline='', encoding='utf-8') as outfile:

        reader = csv.DictReader(infile)

        # Get original fieldnames from input CSV, filter out None/empty
        # (trailing commas in the header row yield empty field names).
        original_fields = [f for f in reader.fieldnames if f and f.strip()]

        # Original columns first, then one column per gradient.
        gradient_columns = [g['column_name'] for g in gradients]
        output_fields = list(original_fields) + gradient_columns

        # Metadata columns appear only when the caller supplied values.
        if analyst is not None:
            output_fields.append('analyst')
        if standpoint is not None:
            output_fields.append('standpoint')

        writer = csv.DictWriter(outfile, fieldnames=output_fields)
        writer.writeheader()

        # Process each protocol row.
        row_count = 0
        for protocol_row in reader:
            # Start with original row data; drop keys filtered from the
            # header (and DictReader's None restkey for over-long rows).
            output_row = {k: v for k, v in protocol_row.items() if k and k.strip()}

            # Initialize all gradient columns as empty (to be filled by LLM).
            for gradient in gradients:
                output_row[gradient['column_name']] = ''

            # Add metadata if provided.
            if analyst is not None:
                output_row['analyst'] = analyst
            if standpoint is not None:
                output_row['standpoint'] = standpoint

            writer.writerow(output_row)
            row_count += 1

            # DictReader fills short rows with None; guard before .strip()
            # so a missing Descriptor value doesn't raise AttributeError.
            descriptor = (protocol_row.get('Descriptor') or '').strip()
            print(f"Processed protocol {row_count}: {descriptor}")

    print(f"\nOutput written to: {output_csv}")
    print(f"Total protocols: {row_count}")
    print(f"Gradient columns added: {len(gradients)}")
    print(f"\nGradient columns:")
    for i, gradient in enumerate(gradients, 1):
        print(f"  {i}. {gradient['column_name']}")
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse arguments, validate input paths, process the CSV."""
    cli = argparse.ArgumentParser(
        description='Process protocol CSV and add bicorder diagnostic columns',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
python3 bicorder_analyze.py protocols_edited.csv -o output.csv
python3 bicorder_analyze.py protocols_raw.csv -o output.csv -a "Jane Doe" -s "Researcher perspective"

The script will preserve all original columns and add one column per diagnostic gradient.
Each gradient column will be empty, ready to be filled by LLM commands.
"""
    )

    cli.add_argument('input_csv', help='Input CSV file with protocol data')
    cli.add_argument('-o', '--output', required=True, help='Output CSV file')
    cli.add_argument('-b', '--bicorder',
                     default='../bicorder.json',
                     help='Path to bicorder.json (default: ../bicorder.json)')
    cli.add_argument('-a', '--analyst', help='Analyst name (adds analyst column)')
    cli.add_argument('-s', '--standpoint', help='Analyst standpoint (adds standpoint column)')

    opts = cli.parse_args()

    # Fail fast with a clear diagnostic if either required file is missing.
    # The input CSV is checked before the bicorder config, preserving the
    # original order of error reporting.
    if not Path(opts.input_csv).exists():
        print(f"Error: Input file '{opts.input_csv}' not found", file=sys.stderr)
        sys.exit(1)

    if not Path(opts.bicorder).exists():
        print(f"Error: Bicorder config '{opts.bicorder}' not found", file=sys.stderr)
        sys.exit(1)

    # Hand everything to the processing routine.
    process_csv(
        opts.input_csv,
        opts.output,
        opts.bicorder,
        opts.analyst,
        opts.standpoint
    )
|
||||
|
||||
|
||||
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user