414 lines
14 KiB
Python
414 lines
14 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Generate bicorder.txt from bicorder.json
|
|
"""
|
|
|
|
import json
|
|
import argparse
|
|
import sys
|
|
import os
|
|
from pathlib import Path
|
|
|
|
|
|
# Simple version-based approach
|
|
#
|
|
# The model includes a 'bicorder_version' field indicating which version of
|
|
# bicorder.json it was trained on. The code checks that versions match before
|
|
# calculating. This ensures the gradient structure is compatible.
|
|
#
|
|
# When bicorder.json changes (gradients added/removed/reordered), update the
|
|
# version number and retrain the model.
|
|
|
|
|
|
def load_classifier_model():
    """
    Load the LDA classifier model from bicorder_model.json.

    Searches a few candidate locations (the script's 'analysis'
    subdirectory, next to the script, and the same two paths relative to
    the current working directory) and returns the parsed JSON of the
    first file found, or None when no model file exists.
    """
    here = Path(__file__).parent
    candidates = (
        here / 'analysis' / 'bicorder_model.json',
        here / 'bicorder_model.json',
        Path('analysis/bicorder_model.json'),
        Path('bicorder_model.json'),
    )

    for candidate in candidates:
        if not candidate.exists():
            continue
        with open(candidate, 'r') as f:
            return json.load(f)

    return None
|
|
|
|
|
|
def calculate_lda_score(values_array, model):
    """
    Compute the LDA discriminant score for an array of gradient values.

    Args:
        values_array: list of 23 values (1-9) in the order expected by the model
        model: loaded classifier model

    Returns:
        LDA score (float), or None if insufficient data
    """
    if model is None:
        return None

    # The value vector must line up one-to-one with the model's dimensions.
    if len(values_array) != len(model['dimensions']):
        return None

    # Standardize each raw value with the model's scaler (z-score).
    scaler = model['scaler']
    standardized = [
        (raw - mu) / sigma
        for raw, mu, sigma in zip(values_array, scaler['mean'], scaler['scale'])
    ]

    # LDA score is the dot product with the coefficients plus the intercept.
    lda = model['lda']
    score = lda['intercept']
    for weight, feature in zip(lda['coefficients'], standardized):
        score += weight * feature

    return score
|
|
|
|
|
|
def lda_score_to_scale(lda_score):
    """
    Map an LDA score onto the 1-9 gradient scale.

    LDA scores typically span roughly -4..+4 (range 8) while the target
    scale is 1..9 (also range 8), so: value = 5 + lda_score * 4/3.
      - -3 or less -> 1 (bureaucratic)
      -  0         -> 5 (boundary)
      - +3 or more -> 9 (relational)
    """
    if lda_score is None:
        return None

    raw = 5 + lda_score * (4.0 / 3.0)

    # Clamp into [1, 9] before rounding to an integer scale value.
    clamped = min(max(raw, 1), 9)
    return round(clamped)
|
|
|
|
|
|
def calculate_hardness(diagnostic_values):
    """Hardness/softness: the rounded mean of all non-None diagnostic values."""
    if not diagnostic_values:
        return None

    present = [v for v in diagnostic_values if v is not None]
    if not present:
        return None

    return round(sum(present) / len(present))
|
|
|
|
|
|
def calculate_polarization(diagnostic_values):
    """
    Polarization on a 1-9 scale (1 = extreme, 9 = centrist), based on
    the mean absolute distance of the values from the center point (5).
    """
    if not diagnostic_values:
        return None

    present = [v for v in diagnostic_values if v is not None]
    if not present:
        return None

    # Mean distance from the center value 5; the largest possible
    # distance is 4 (a value of 1 or 9).
    avg_offset = sum(abs(v - 5) for v in present) / len(present)

    # Invert onto the 1-9 scale: all values at the extremes -> 1,
    # all values at the center -> 9. (offset/4 * 8 == offset * 2)
    score = 9 - avg_offset * 2

    return round(max(1, min(9, score)))
|
|
|
|
|
|
def calculate_automated_analysis(json_data):
    """
    Fill in the values of automated analysis fields.

    Modifies json_data in place. Collects every diagnostic gradient value,
    then for each analysis item flagged "automated" computes its value:
      - "hardness":     rounded mean of the recorded diagnostic values
      - "polarized":    distance-from-center polarization score
      - "bureaucratic": LDA classifier result, only when a model is
        available and its bicorder_version matches this document's version

    Args:
        json_data: parsed bicorder.json document; modified in place.
    """
    # diagnostic_values holds only the values actually recorded;
    # values_array is the fixed-length feature vector for the classifier,
    # with missing entries filled by the neutral midpoint (5.0).
    diagnostic_values = []
    values_array = []

    for diagnostic_set in json_data.get("diagnostic", []):
        for gradient in diagnostic_set.get("gradients", []):
            value = gradient.get("value")
            if value is not None:
                diagnostic_values.append(value)
                values_array.append(float(value))
            else:
                # Fill missing with neutral value
                values_array.append(5.0)

    # Nothing recorded: leave all automated fields untouched.
    if not diagnostic_values:
        return

    # Load classifier model (may be None if no model file is present).
    model = load_classifier_model()

    # Version compatibility: the model is only valid for the
    # bicorder.json version it was trained on (gradient count/order must
    # match) — see the note at the top of the file.
    bicorder_version = json_data.get("version", "unknown")
    model_version = model.get("bicorder_version", "unknown") if model else "unknown"

    version_mismatch = (model and bicorder_version != model_version)

    # Calculate each automated analysis field.
    for analysis_item in json_data.get("analysis", []):
        if not analysis_item.get("automated", False):
            continue

        term_left = analysis_item.get("term_left", "")

        if term_left == "hardness":
            analysis_item["value"] = calculate_hardness(diagnostic_values)
        elif term_left == "polarized":
            analysis_item["value"] = calculate_polarization(diagnostic_values)
        elif term_left == "bureaucratic":
            if version_mismatch:
                # Skip the calculation if versions don't match. Warnings
                # go to stderr, consistent with error reporting in main().
                print(
                    f"Warning: Model version ({model_version}) doesn't match "
                    f"bicorder version ({bicorder_version}). "
                    "Skipping bureaucratic/relational calculation.",
                    file=sys.stderr,
                )
                analysis_item["value"] = None
            elif model:
                lda_score = calculate_lda_score(values_array, model)
                analysis_item["value"] = lda_score_to_scale(lda_score)
|
|
|
|
|
|
def center_text(text, width):
    """Return *text* centered in a field of *width* characters."""
    return f"{text:^{width}}"
|
|
|
|
|
|
def format_gradient_bar(value):
    """
    Render the 9-slot gradient bar for a value.

    None or anything outside the integer range 1-9 yields the empty bar
    "[---------]"; a valid value is drawn at its slot, e.g. 5 gives
    "[----5----]".
    """
    # Single guard covers None, non-ints, and out-of-range values.
    if not isinstance(value, int) or not 1 <= value <= 9:
        return "[---------]"

    # Build the bar: (value-1) dashes, the digit, then the remainder.
    return "[{}{}{}]".format("-" * (value - 1), int(value), "-" * (9 - value))
|
|
|
|
|
|
def format_gradient_line(term_left, term_right, value, left_width, right_width, center_width):
    """
    Build one centered gradient line, e.g.
    "   explicit < [----5----] > implicit   ".

    The left term is right-aligned to left_width and the right term is
    left-aligned to right_width so the bars line up across rows; the
    whole row is then centered to center_width.
    """
    left = term_left.rjust(left_width)
    right = term_right.ljust(right_width)
    row = f"{left} < {format_gradient_bar(value)} > {right}"
    return center_text(row, center_width)
|
|
|
|
|
|
def format_metadata_field(field_value, field_name):
    """
    Render a metadata value, or a bracketed placeholder (e.g. "[Analyst]")
    when the value is absent (None or empty string).
    """
    if field_value not in (None, ""):
        return str(field_value)
    return f"[{field_name}]"
|
|
|
|
|
|
def generate_bicorder_text(json_data):
    """
    Generate the formatted bicorder text from JSON data.

    Renders, in order: a two-line header, the metadata fields, each
    diagnostic set's gradient lines, the analysis lines, and a glossary
    table (pandoc-style pipe table) of every term encountered. Every
    line is centered to a common width computed in a first pass from the
    widest gradient line, heading, and metadata field.

    Args:
        json_data: parsed bicorder.json document (read, not modified).

    Returns:
        The complete text document as a single newline-joined string.
    """
    lines = []

    # First pass: calculate maximum widths for left and right terms
    # Also collect all terms for the glossary
    max_left_width = 0
    max_right_width = 0
    glossary_terms = {}  # Dictionary to store term: description mappings

    # Check diagnostic gradients
    for diagnostic_set in json_data.get("diagnostic", []):
        for gradient in diagnostic_set.get("gradients", []):
            term_left = gradient.get("term_left", "")
            term_right = gradient.get("term_right", "")
            max_left_width = max(max_left_width, len(term_left))
            max_right_width = max(max_right_width, len(term_right))

            # Collect terms for glossary
            # (later duplicates of a term overwrite earlier descriptions)
            if term_left:
                glossary_terms[term_left] = gradient.get("term_left_description", "")
            if term_right:
                glossary_terms[term_right] = gradient.get("term_right_description", "")

    # Check analysis items
    for analysis_item in json_data.get("analysis", []):
        term_left = analysis_item.get("term_left", "")
        term_right = analysis_item.get("term_right", "")
        max_left_width = max(max_left_width, len(term_left))
        max_right_width = max(max_right_width, len(term_right))

        # Collect terms for glossary
        if term_left:
            glossary_terms[term_left] = analysis_item.get("term_left_description", "")
        if term_right:
            glossary_terms[term_right] = analysis_item.get("term_right_description", "")

    # Calculate the width needed for centering
    # Gradient line format: "{left_term} < [|||||||||] > {right_term}"
    # That's: max_left_width + 3 + 11 + 3 + max_right_width
    gradient_line_width = max_left_width + max_right_width + 17

    # Also check metadata and headers
    metadata = json_data.get("metadata", {})
    max_text_width = max(
        len("Protocol"),
        len("BICORDER"),
        len(format_metadata_field(metadata.get("protocol"), "Protocol")),
        len(format_metadata_field(metadata.get("description"), "Description")),
        len(format_metadata_field(metadata.get("analyst"), "Analyst")),
        len(format_metadata_field(metadata.get("standpoint"), "Standpoint")),
        len(format_metadata_field(metadata.get("timestamp"), "Timestamp")),
        len("ANALYSIS")
    )

    # Check diagnostic set names
    for diagnostic_set in json_data.get("diagnostic", []):
        set_name = diagnostic_set.get("set_name", "").upper()
        max_text_width = max(max_text_width, len(set_name))

    # Use the maximum of gradient line width and text width
    center_width = max(gradient_line_width, max_text_width)

    # Header
    lines.append(center_text("Protocol", center_width))
    lines.append(center_text("BICORDER", center_width))
    lines.append("")

    # Metadata section
    lines.append(center_text(format_metadata_field(metadata.get("protocol"), "Protocol"), center_width))
    lines.append(center_text(format_metadata_field(metadata.get("description"), "Description"), center_width))
    lines.append(center_text(format_metadata_field(metadata.get("analyst"), "Analyst"), center_width))
    lines.append(center_text(format_metadata_field(metadata.get("standpoint"), "Standpoint"), center_width))
    lines.append(center_text(format_metadata_field(metadata.get("timestamp"), "Timestamp"), center_width))
    lines.append("")

    # Diagnostic sections
    for diagnostic_set in json_data.get("diagnostic", []):
        set_name = diagnostic_set.get("set_name", "").upper()
        lines.append(center_text(set_name, center_width))

        for gradient in diagnostic_set.get("gradients", []):
            term_left = gradient.get("term_left", "")
            term_right = gradient.get("term_right", "")
            value = gradient.get("value")

            lines.append(format_gradient_line(term_left, term_right, value, max_left_width, max_right_width, center_width))

        lines.append("")

    # Analysis section
    lines.append(center_text("ANALYSIS", center_width))
    for analysis_item in json_data.get("analysis", []):
        term_left = analysis_item.get("term_left", "")
        term_right = analysis_item.get("term_right", "")
        value = analysis_item.get("value")

        lines.append(format_gradient_line(term_left, term_right, value, max_left_width, max_right_width, center_width))

    lines.append("")

    # Glossary section
    lines.append(center_text("GLOSSARY", center_width))
    lines.append("")

    # Generate pandoc-compatible table
    # Sort terms alphabetically (case-insensitive) for consistent output
    sorted_terms = sorted(glossary_terms.items(), key=lambda x: x[0].lower())

    if sorted_terms:
        # Calculate column widths for the table
        max_term_width = max(len(term) for term, _ in sorted_terms)
        max_term_width = max(max_term_width, len("Term"))  # At least as wide as header

        # Build the table
        # Header row
        lines.append(f"| {'Term'.ljust(max_term_width)} | Description |")
        # Separator row
        lines.append(f"| {'-' * max_term_width} | {'-' * 11} |")

        # Data rows
        for term, description in sorted_terms:
            lines.append(f"| {term.ljust(max_term_width)} | {description} |")

    lines.append("")

    return "\n".join(lines)
|
|
|
|
|
|
def main():
    """
    Command-line entry point.

    Reads the input JSON file, computes automated analysis values, and
    writes the formatted text rendering to the output path. Exits with
    status 1 on a missing/invalid input file or an unwritable output file.
    """
    # Set up argument parser
    parser = argparse.ArgumentParser(
        description="Generate formatted bicorder text from JSON input"
    )
    parser.add_argument(
        "input_json",
        help="Path to input JSON file"
    )
    parser.add_argument(
        "output_txt",
        help="Path to output TXT file"
    )

    args = parser.parse_args()

    # Read the JSON file. Explicit encoding so behavior doesn't depend on
    # the platform's default locale encoding (e.g. cp1252 on Windows).
    try:
        with open(args.input_json, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Error: Input file '{args.input_json}' not found.", file=sys.stderr)
        sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON in '{args.input_json}': {e}", file=sys.stderr)
        sys.exit(1)

    # Calculate automated analysis values (modifies data in place)
    calculate_automated_analysis(data)

    # Generate the formatted text
    output = generate_bicorder_text(data)

    # Write to output file
    try:
        with open(args.output_txt, "w", encoding="utf-8") as f:
            f.write(output)
        print(f"Successfully generated '{args.output_txt}'")
    except IOError as e:
        print(f"Error: Could not write to '{args.output_txt}': {e}", file=sys.stderr)
        sys.exit(1)
|
|
|
|
|
|
# Script entry point: run only when executed directly, not on import.
if __name__ == "__main__":
    main()
|