414 lines
14 KiB
Python
414 lines
14 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Generate bicorder.txt from bicorder.json
|
|
"""
|
|
|
|
import json
|
|
import argparse
|
|
import sys
|
|
import os
|
|
from pathlib import Path
|
|
|
|
|
|
# Simple version-based approach
|
|
#
|
|
# The model includes a 'bicorder_version' field indicating which version of
|
|
# bicorder.json it was trained on. The code checks that versions match before
|
|
# calculating. This ensures the gradient structure is compatible.
|
|
#
|
|
# When bicorder.json changes (gradients added/removed/reordered), update the
|
|
# version number and retrain the model.
|
|
|
|
|
|
def load_classifier_model():
    """
    Load the LDA classifier model from bicorder_model.json.

    Searches a few candidate locations (the script's 'analysis'
    subdirectory, next to the script, and the same two paths relative to
    the current working directory) and returns the parsed JSON of the
    first file found, or None when no model file exists.
    """
    here = Path(__file__).parent
    candidates = (
        here / 'analysis' / 'bicorder_model.json',
        here / 'bicorder_model.json',
        Path('analysis/bicorder_model.json'),
        Path('bicorder_model.json'),
    )

    for candidate in candidates:
        if not candidate.exists():
            continue
        with open(candidate, 'r') as f:
            return json.load(f)

    return None
|
|
|
|
|
|
def calculate_lda_score(values_array, model):
    """
    Compute the LDA discriminant score for an array of gradient values.

    Args:
        values_array: list of 23 values (1-9) in the order expected by the model
        model: loaded classifier model

    Returns:
        LDA score (float), or None if insufficient data
    """
    if model is None:
        return None

    # The value vector must line up one-to-one with the model's dimensions.
    if len(values_array) != len(model['dimensions']):
        return None

    # Standardize each raw value with the model's scaler (z-score).
    scaler = model['scaler']
    standardized = [
        (raw - mu) / sigma
        for raw, mu, sigma in zip(values_array, scaler['mean'], scaler['scale'])
    ]

    # LDA score is the dot product with the coefficients plus the intercept.
    lda = model['lda']
    score = lda['intercept']
    for weight, feature in zip(lda['coefficients'], standardized):
        score += weight * feature

    return score
|
|
|
|
|
|
def lda_score_to_scale(lda_score):
    """
    Map an LDA score onto the 1-9 gradient scale.

    LDA scores typically span roughly -4..+4 (range 8) while the target
    scale is 1..9 (also range 8), so: value = 5 + lda_score * 4/3.
      - -3 or less -> 1 (bureaucratic)
      -  0         -> 5 (boundary)
      - +3 or more -> 9 (relational)
    """
    if lda_score is None:
        return None

    raw = 5 + lda_score * (4.0 / 3.0)

    # Clamp into [1, 9] before rounding to an integer scale value.
    clamped = min(max(raw, 1), 9)
    return round(clamped)
|
|
|
|
|
|
def calculate_hardness(diagnostic_values):
    """Hardness/softness: the rounded mean of all non-None diagnostic values."""
    if not diagnostic_values:
        return None

    present = [v for v in diagnostic_values if v is not None]
    if not present:
        return None

    return round(sum(present) / len(present))
|
|
|
|
|
|
def calculate_polarization(diagnostic_values):
    """
    Polarization on a 1-9 scale (1 = extreme, 9 = centrist), based on
    the mean absolute distance of the values from the center point (5).
    """
    if not diagnostic_values:
        return None

    present = [v for v in diagnostic_values if v is not None]
    if not present:
        return None

    # Mean distance from the center value 5; the largest possible
    # distance is 4 (a value of 1 or 9).
    avg_offset = sum(abs(v - 5) for v in present) / len(present)

    # Invert onto the 1-9 scale: all values at the extremes -> 1,
    # all values at the center -> 9. (offset/4 * 8 == offset * 2)
    score = 9 - avg_offset * 2

    return round(max(1, min(9, score)))
|
|
|
|
|
|
def calculate_automated_analysis(json_data):
    """
    Fill in the values of automated analysis fields.

    Modifies json_data in place. Collects every diagnostic gradient value,
    then for each analysis item flagged "automated" computes its value:
      - "hardness":     rounded mean of the recorded diagnostic values
      - "polarized":    distance-from-center polarization score
      - "bureaucratic": LDA classifier result, only when a model is
        available and its bicorder_version matches this document's version

    Args:
        json_data: parsed bicorder.json document; modified in place.
    """
    # diagnostic_values holds only the values actually recorded;
    # values_array is the fixed-length feature vector for the classifier,
    # with missing entries filled by the neutral midpoint (5.0).
    diagnostic_values = []
    values_array = []

    for diagnostic_set in json_data.get("diagnostic", []):
        for gradient in diagnostic_set.get("gradients", []):
            value = gradient.get("value")
            if value is not None:
                diagnostic_values.append(value)
                values_array.append(float(value))
            else:
                # Fill missing with neutral value
                values_array.append(5.0)

    # Nothing recorded: leave all automated fields untouched.
    if not diagnostic_values:
        return

    # Load classifier model (may be None if no model file is present).
    model = load_classifier_model()

    # Version compatibility: the model is only valid for the
    # bicorder.json version it was trained on (gradient count/order must
    # match) — see the note at the top of the file.
    bicorder_version = json_data.get("version", "unknown")
    model_version = model.get("bicorder_version", "unknown") if model else "unknown"

    version_mismatch = (model and bicorder_version != model_version)

    # Calculate each automated analysis field.
    for analysis_item in json_data.get("analysis", []):
        if not analysis_item.get("automated", False):
            continue

        term_left = analysis_item.get("term_left", "")

        if term_left == "hardness":
            analysis_item["value"] = calculate_hardness(diagnostic_values)
        elif term_left == "polarized":
            analysis_item["value"] = calculate_polarization(diagnostic_values)
        elif term_left == "bureaucratic":
            if version_mismatch:
                # Skip the calculation if versions don't match. Warnings
                # go to stderr, consistent with error reporting in main().
                print(
                    f"Warning: Model version ({model_version}) doesn't match "
                    f"bicorder version ({bicorder_version}). "
                    "Skipping bureaucratic/relational calculation.",
                    file=sys.stderr,
                )
                analysis_item["value"] = None
            elif model:
                lda_score = calculate_lda_score(values_array, model)
                analysis_item["value"] = lda_score_to_scale(lda_score)
|
|
|
|
|
|
def center_text(text, width):
    """Return *text* centered in a field of *width* characters."""
    return f"{text:^{width}}"
|
|
|
|
|
|
def format_gradient_bar(value):
    """
    Render the 9-slot gradient bar for a value.

    None or anything outside the integer range 1-9 yields the empty bar
    "[---------]"; a valid value is drawn at its slot, e.g. 5 gives
    "[----5----]".
    """
    # Single guard covers None, non-ints, and out-of-range values.
    if not isinstance(value, int) or not 1 <= value <= 9:
        return "[---------]"

    # Build the bar: (value-1) dashes, the digit, then the remainder.
    return "[{}{}{}]".format("-" * (value - 1), int(value), "-" * (9 - value))
|
|
|
|
|
|
def format_gradient_line(term_left, term_right, value, left_width, right_width, center_width):
    """
    Build one centered gradient line, e.g.
    "   explicit < [----5----] > implicit   ".

    The left term is right-aligned to left_width and the right term is
    left-aligned to right_width so the bars line up across rows; the
    whole row is then centered to center_width.
    """
    left = term_left.rjust(left_width)
    right = term_right.ljust(right_width)
    row = f"{left} < {format_gradient_bar(value)} > {right}"
    return center_text(row, center_width)
|
|
|
|
|
|
def format_metadata_field(field_value, field_name):
    """
    Render a metadata value, or a bracketed placeholder (e.g. "[Analyst]")
    when the value is absent (None or empty string).
    """
    if field_value not in (None, ""):
        return str(field_value)
    return f"[{field_name}]"
|
|
|
|
|
|
def generate_bicorder_text(json_data):
    """
    Generate the formatted bicorder text from JSON data.

    Renders, in order: a two-line header, the metadata fields, each
    diagnostic set's gradient lines, the analysis lines, and a glossary
    table (pandoc-style pipe table) of every term encountered. Every
    line is centered to a common width computed in a first pass from the
    widest gradient line, heading, and metadata field.

    Args:
        json_data: parsed bicorder.json document (read, not modified).

    Returns:
        The complete text document as a single newline-joined string.
    """
    lines = []

    # First pass: calculate maximum widths for left and right terms
    # Also collect all terms for the glossary
    max_left_width = 0
    max_right_width = 0
    glossary_terms = {}  # Dictionary to store term: description mappings

    # Check diagnostic gradients
    for diagnostic_set in json_data.get("diagnostic", []):
        for gradient in diagnostic_set.get("gradients", []):
            term_left = gradient.get("term_left", "")
            term_right = gradient.get("term_right", "")
            max_left_width = max(max_left_width, len(term_left))
            max_right_width = max(max_right_width, len(term_right))

            # Collect terms for glossary
            # (later duplicates of a term overwrite earlier descriptions)
            if term_left:
                glossary_terms[term_left] = gradient.get("term_left_description", "")
            if term_right:
                glossary_terms[term_right] = gradient.get("term_right_description", "")

    # Check analysis items
    for analysis_item in json_data.get("analysis", []):
        term_left = analysis_item.get("term_left", "")
        term_right = analysis_item.get("term_right", "")
        max_left_width = max(max_left_width, len(term_left))
        max_right_width = max(max_right_width, len(term_right))

        # Collect terms for glossary
        if term_left:
            glossary_terms[term_left] = analysis_item.get("term_left_description", "")
        if term_right:
            glossary_terms[term_right] = analysis_item.get("term_right_description", "")

    # Calculate the width needed for centering
    # Gradient line format: "{left_term} < [|||||||||] > {right_term}"
    # That's: max_left_width + 3 + 11 + 3 + max_right_width
    gradient_line_width = max_left_width + max_right_width + 17

    # Also check metadata and headers
    metadata = json_data.get("metadata", {})
    max_text_width = max(
        len("Protocol"),
        len("BICORDER"),
        len(format_metadata_field(metadata.get("protocol"), "Protocol")),
        len(format_metadata_field(metadata.get("description"), "Description")),
        len(format_metadata_field(metadata.get("analyst"), "Analyst")),
        len(format_metadata_field(metadata.get("standpoint"), "Standpoint")),
        len(format_metadata_field(metadata.get("timestamp"), "Timestamp")),
        len("ANALYSIS")
    )

    # Check diagnostic set names
    for diagnostic_set in json_data.get("diagnostic", []):
        set_name = diagnostic_set.get("set_name", "").upper()
        max_text_width = max(max_text_width, len(set_name))

    # Use the maximum of gradient line width and text width
    center_width = max(gradient_line_width, max_text_width)

    # Header
    lines.append(center_text("Protocol", center_width))
    lines.append(center_text("BICORDER", center_width))
    lines.append("")

    # Metadata section
    lines.append(center_text(format_metadata_field(metadata.get("protocol"), "Protocol"), center_width))
    lines.append(center_text(format_metadata_field(metadata.get("description"), "Description"), center_width))
    lines.append(center_text(format_metadata_field(metadata.get("analyst"), "Analyst"), center_width))
    lines.append(center_text(format_metadata_field(metadata.get("standpoint"), "Standpoint"), center_width))
    lines.append(center_text(format_metadata_field(metadata.get("timestamp"), "Timestamp"), center_width))
    lines.append("")

    # Diagnostic sections
    for diagnostic_set in json_data.get("diagnostic", []):
        set_name = diagnostic_set.get("set_name", "").upper()
        lines.append(center_text(set_name, center_width))

        for gradient in diagnostic_set.get("gradients", []):
            term_left = gradient.get("term_left", "")
            term_right = gradient.get("term_right", "")
            value = gradient.get("value")

            lines.append(format_gradient_line(term_left, term_right, value, max_left_width, max_right_width, center_width))

        lines.append("")

    # Analysis section
    lines.append(center_text("ANALYSIS", center_width))
    for analysis_item in json_data.get("analysis", []):
        term_left = analysis_item.get("term_left", "")
        term_right = analysis_item.get("term_right", "")
        value = analysis_item.get("value")

        lines.append(format_gradient_line(term_left, term_right, value, max_left_width, max_right_width, center_width))

    lines.append("")

    # Glossary section
    lines.append(center_text("GLOSSARY", center_width))
    lines.append("")

    # Generate pandoc-compatible table
    # Sort terms alphabetically (case-insensitive) for consistent output
    sorted_terms = sorted(glossary_terms.items(), key=lambda x: x[0].lower())

    if sorted_terms:
        # Calculate column widths for the table
        max_term_width = max(len(term) for term, _ in sorted_terms)
        max_term_width = max(max_term_width, len("Term"))  # At least as wide as header

        # Build the table
        # Header row
        lines.append(f"| {'Term'.ljust(max_term_width)} | Description |")
        # Separator row
        lines.append(f"| {'-' * max_term_width} | {'-' * 11} |")

        # Data rows
        for term, description in sorted_terms:
            lines.append(f"| {term.ljust(max_term_width)} | {description} |")

    lines.append("")

    return "\n".join(lines)
|
|
|
|
|
|
def main():
    """
    Command-line entry point.

    Reads the input JSON file, computes automated analysis values, and
    writes the formatted text rendering to the output path. Exits with
    status 1 on a missing/invalid input file or an unwritable output file.
    """
    # Set up argument parser
    parser = argparse.ArgumentParser(
        description="Generate formatted bicorder text from JSON input"
    )
    parser.add_argument(
        "input_json",
        help="Path to input JSON file"
    )
    parser.add_argument(
        "output_txt",
        help="Path to output TXT file"
    )

    args = parser.parse_args()

    # Read the JSON file. Explicit encoding so behavior doesn't depend on
    # the platform's default locale encoding (e.g. cp1252 on Windows).
    try:
        with open(args.input_json, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Error: Input file '{args.input_json}' not found.", file=sys.stderr)
        sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON in '{args.input_json}': {e}", file=sys.stderr)
        sys.exit(1)

    # Calculate automated analysis values (modifies data in place)
    calculate_automated_analysis(data)

    # Generate the formatted text
    output = generate_bicorder_text(data)

    # Write to output file
    try:
        with open(args.output_txt, "w", encoding="utf-8") as f:
            f.write(output)
        print(f"Successfully generated '{args.output_txt}'")
    except IOError as e:
        print(f"Error: Could not write to '{args.output_txt}': {e}", file=sys.stderr)
        sys.exit(1)
|
|
|
|
|
|
# Script entry point: run only when executed directly, not on import.
if __name__ == "__main__":
    main()
|