Set up analysis scripts
This commit is contained in:
155
analysis/bicorder_analyze.py
Normal file
155
analysis/bicorder_analyze.py
Normal file
@@ -0,0 +1,155 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Protocol Bicorder Analysis Script
|
||||
|
||||
Processes a two-column CSV file (protocol descriptor and description) and adds
|
||||
columns for each diagnostic gradient from bicorder.json. Values to be filled
|
||||
by LLM commands.
|
||||
"""
|
||||
|
||||
import csv
|
||||
import json
|
||||
import sys
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def load_bicorder_config(bicorder_path):
    """Load and parse the bicorder.json configuration file.

    Args:
        bicorder_path: Path to the bicorder.json file.

    Returns:
        The parsed JSON document (expected to be a dict with a
        'diagnostic' key, per extract_gradients).
    """
    # Explicit UTF-8 for consistency with the CSV handling in process_csv;
    # the platform default encoding can mis-read UTF-8 configs on Windows.
    with open(bicorder_path, 'r', encoding='utf-8') as f:
        return json.load(f)
|
||||
|
||||
|
||||
def extract_gradients(bicorder_data):
    """Flatten every gradient from every diagnostic set into a list of dicts.

    Each record carries a unique 'column_name' of the form
    "<set_name>_<term_left>_vs_<term_right>" plus the gradient's terms
    and their descriptions, ready to be used as CSV column metadata.
    """
    def as_record(set_name, grad):
        # One flat record per gradient; column name doubles as a unique key.
        left, right = grad['term_left'], grad['term_right']
        return {
            'column_name': f"{set_name}_{left}_vs_{right}",
            'set_name': set_name,
            'term_left': left,
            'term_left_description': grad['term_left_description'],
            'term_right': right,
            'term_right_description': grad['term_right_description'],
        }

    return [
        as_record(dset['set_name'], grad)
        for dset in bicorder_data['diagnostic']
        for grad in dset['gradients']
    ]
|
||||
|
||||
|
||||
def process_csv(input_csv, output_csv, bicorder_path, analyst=None, standpoint=None):
    """
    Process the input CSV and add gradient columns.

    Reads protocol rows from input_csv, appends one empty column per
    diagnostic gradient defined in bicorder.json (values to be filled by
    LLM commands later), optionally appends analyst/standpoint metadata
    columns, and writes the combined rows to output_csv.

    Args:
        input_csv: Path to input CSV file
        output_csv: Path to output CSV file
        bicorder_path: Path to bicorder.json file
        analyst: Optional analyst name (adds an 'analyst' column)
        standpoint: Optional standpoint description (adds a 'standpoint' column)
    """
    # Load bicorder configuration and flatten it into gradient records.
    bicorder_data = load_bicorder_config(bicorder_path)
    gradients = extract_gradients(bicorder_data)

    with open(input_csv, 'r', encoding='utf-8') as infile, \
         open(output_csv, 'w', newline='', encoding='utf-8') as outfile:

        reader = csv.DictReader(infile)

        # Get original fieldnames from input CSV, filter out None/empty
        # (trailing commas in the header row yield empty field names).
        original_fields = [f for f in reader.fieldnames if f and f.strip()]

        # Original columns first, then one column per gradient.
        gradient_columns = [g['column_name'] for g in gradients]
        output_fields = list(original_fields) + gradient_columns

        # Metadata columns appear only when the caller supplied values.
        if analyst is not None:
            output_fields.append('analyst')
        if standpoint is not None:
            output_fields.append('standpoint')

        writer = csv.DictWriter(outfile, fieldnames=output_fields)
        writer.writeheader()

        # Process each protocol row.
        row_count = 0
        for protocol_row in reader:
            # Start with original row data; drop keys filtered from the
            # header (and DictReader's None restkey for over-long rows).
            output_row = {k: v for k, v in protocol_row.items() if k and k.strip()}

            # Initialize all gradient columns as empty (to be filled by LLM).
            for gradient in gradients:
                output_row[gradient['column_name']] = ''

            # Add metadata if provided.
            if analyst is not None:
                output_row['analyst'] = analyst
            if standpoint is not None:
                output_row['standpoint'] = standpoint

            writer.writerow(output_row)
            row_count += 1

            # DictReader fills short rows with None; guard before .strip()
            # so a missing Descriptor value doesn't raise AttributeError.
            descriptor = (protocol_row.get('Descriptor') or '').strip()
            print(f"Processed protocol {row_count}: {descriptor}")

    print(f"\nOutput written to: {output_csv}")
    print(f"Total protocols: {row_count}")
    print(f"Gradient columns added: {len(gradients)}")
    print(f"\nGradient columns:")
    for i, gradient in enumerate(gradients, 1):
        print(f"  {i}. {gradient['column_name']}")
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse arguments, validate input paths, process the CSV."""
    cli = argparse.ArgumentParser(
        description='Process protocol CSV and add bicorder diagnostic columns',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
python3 bicorder_analyze.py protocols_edited.csv -o output.csv
python3 bicorder_analyze.py protocols_raw.csv -o output.csv -a "Jane Doe" -s "Researcher perspective"

The script will preserve all original columns and add one column per diagnostic gradient.
Each gradient column will be empty, ready to be filled by LLM commands.
"""
    )

    cli.add_argument('input_csv', help='Input CSV file with protocol data')
    cli.add_argument('-o', '--output', required=True, help='Output CSV file')
    cli.add_argument('-b', '--bicorder',
                     default='../bicorder.json',
                     help='Path to bicorder.json (default: ../bicorder.json)')
    cli.add_argument('-a', '--analyst', help='Analyst name (adds analyst column)')
    cli.add_argument('-s', '--standpoint', help='Analyst standpoint (adds standpoint column)')

    opts = cli.parse_args()

    # Fail fast with a clear diagnostic if either required file is missing.
    # The input CSV is checked before the bicorder config, preserving the
    # original order of error reporting.
    if not Path(opts.input_csv).exists():
        print(f"Error: Input file '{opts.input_csv}' not found", file=sys.stderr)
        sys.exit(1)

    if not Path(opts.bicorder).exists():
        print(f"Error: Bicorder config '{opts.bicorder}' not found", file=sys.stderr)
        sys.exit(1)

    # Hand everything to the processing routine.
    process_csv(
        opts.input_csv,
        opts.output,
        opts.bicorder,
        opts.analyst,
        opts.standpoint
    )
|
||||
|
||||
|
||||
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user