Reorganize directory, add manual dataset and sync tooling

- Move all scripts to scripts/, web assets to web/, analysis results
  into self-contained data/readings/<type>_<YYYYMMDD>/ directories
- Add data/readings/manual_20260320/ with 32 JSON readings from
  git.medlab.host/ntnsndr/protocol-bicorder-data
- Add scripts/json_to_csv.py to convert bicorder JSON files to CSV
- Add scripts/sync_readings.sh for one-command sync + re-analysis of
  any dataset backed by a .sync_source config file
- Add scripts/classify_readings.py to apply the LDA classifier to all
  readings and save per-reading cluster assignments
- Add --min-coverage flag to multivariate_analysis.py for sparse/shortform
  datasets; also applies in lda_visualization.py
- Fix lda_visualization.py NaN handling and 0-d array annotation bug
- Update README.md and WORKFLOW.md to document datasets, sync workflow,
  shortform handling, and new scripts

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Nathan Schneider
2026-03-20 17:35:13 -06:00
parent 0c794dddae
commit 897c30406b
545 changed files with 10715 additions and 718 deletions

View File

@@ -0,0 +1,155 @@
#!/usr/bin/env python3
"""
Protocol Bicorder Analysis Script
Processes a two-column CSV file (protocol descriptor and description) and adds
columns for each diagnostic gradient from bicorder.json. Values to be filled
by LLM commands.
"""
import csv
import json
import sys
import argparse
from pathlib import Path
def load_bicorder_config(bicorder_path):
    """Read the bicorder.json configuration file and return it as a dict."""
    with open(bicorder_path, 'r') as config_file:
        return json.load(config_file)
def extract_gradients(bicorder_data):
    """Flatten every diagnostic set into a list of gradient descriptor dicts.

    Each entry carries a unique ``column_name`` of the form
    ``<set>_<left>_vs_<right>`` plus the raw terms and their descriptions.
    """
    return [
        {
            'column_name': f"{group['set_name']}_{g['term_left']}_vs_{g['term_right']}",
            'set_name': group['set_name'],
            'term_left': g['term_left'],
            'term_left_description': g['term_left_description'],
            'term_right': g['term_right'],
            'term_right_description': g['term_right_description'],
        }
        for group in bicorder_data['diagnostic']
        for g in group['gradients']
    ]
def process_csv(input_csv, output_csv, bicorder_path, analyst=None, standpoint=None):
    """
    Copy the input CSV to the output path, appending one empty column per
    diagnostic gradient (to be filled by LLM commands) plus optional
    metadata columns.

    Args:
        input_csv: Path to input CSV file (must have a header row)
        output_csv: Path to output CSV file (overwritten)
        bicorder_path: Path to bicorder.json file
        analyst: Optional analyst name (adds an 'analyst' column)
        standpoint: Optional standpoint description (adds a 'standpoint' column)
    """
    # Load bicorder configuration and derive the gradient column set
    bicorder_data = load_bicorder_config(bicorder_path)
    gradients = extract_gradients(bicorder_data)
    with open(input_csv, 'r', encoding='utf-8') as infile, \
        open(output_csv, 'w', newline='', encoding='utf-8') as outfile:
        reader = csv.DictReader(infile)
        # Keep only real input columns; DictReader can yield None/empty
        # names for malformed headers.
        original_fields = [f for f in reader.fieldnames if f and f.strip()]
        gradient_columns = [g['column_name'] for g in gradients]
        output_fields = list(original_fields) + gradient_columns
        # Add metadata columns if provided, guarding against duplicating a
        # column that already exists in the input (duplicate fieldnames
        # would corrupt the header).
        if analyst is not None and 'analyst' not in output_fields:
            output_fields.append('analyst')
        if standpoint is not None and 'standpoint' not in output_fields:
            output_fields.append('standpoint')
        writer = csv.DictWriter(outfile, fieldnames=output_fields)
        writer.writeheader()
        # Process each protocol row
        row_count = 0
        for protocol_row in reader:
            # Start with original row data, filter out None keys
            output_row = {k: v for k, v in protocol_row.items() if k and k.strip()}
            # Initialize all gradient columns as empty (to be filled by LLM)
            for gradient in gradients:
                output_row[gradient['column_name']] = ''
            # Add metadata if provided
            if analyst is not None:
                output_row['analyst'] = analyst
            if standpoint is not None:
                output_row['standpoint'] = standpoint
            writer.writerow(output_row)
            row_count += 1
            # Short rows leave missing cells as None; coalesce to '' before
            # stripping so a ragged input row cannot crash the run.
            descriptor = (protocol_row.get('Descriptor') or '').strip()
            print(f"Processed protocol {row_count}: {descriptor}")
    print(f"\nOutput written to: {output_csv}")
    print(f"Total protocols: {row_count}")
    print(f"Gradient columns added: {len(gradients)}")
    print(f"\nGradient columns:")
    for i, gradient in enumerate(gradients, 1):
        print(f" {i}. {gradient['column_name']}")
def main():
    """CLI entry point: parse arguments, validate paths, run the conversion."""
    arg_parser = argparse.ArgumentParser(
        description='Process protocol CSV and add bicorder diagnostic columns',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
python3 bicorder_analyze.py protocols_edited.csv -o output.csv
python3 bicorder_analyze.py protocols_raw.csv -o output.csv -a "Jane Doe" -s "Researcher perspective"
The script will preserve all original columns and add one column per diagnostic gradient.
Each gradient column will be empty, ready to be filled by LLM commands.
"""
    )
    arg_parser.add_argument('input_csv', help='Input CSV file with protocol data')
    arg_parser.add_argument('-o', '--output', required=True, help='Output CSV file')
    arg_parser.add_argument('-b', '--bicorder',
                            default='../bicorder.json',
                            help='Path to bicorder.json (default: ../bicorder.json)')
    arg_parser.add_argument('-a', '--analyst', help='Analyst name (adds analyst column)')
    arg_parser.add_argument('-s', '--standpoint', help='Analyst standpoint (adds standpoint column)')
    args = arg_parser.parse_args()
    # Bail out early if either required file is missing.
    for path, label in ((args.input_csv, 'Input file'), (args.bicorder, 'Bicorder config')):
        if not Path(path).exists():
            print(f"Error: {label} '{path}' not found", file=sys.stderr)
            sys.exit(1)
    process_csv(
        args.input_csv,
        args.output,
        args.bicorder,
        args.analyst,
        args.standpoint
    )
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,175 @@
#!/usr/bin/env python3
"""
Batch process all protocols in a CSV using the Bicorder framework.
This script orchestrates the entire analysis workflow:
1. Creates output CSV with gradient columns
2. For each protocol row:
- Queries all 23 gradients (each in a new chat)
- Updates CSV with results
"""
import csv
import json
import sys
import argparse
import subprocess
from pathlib import Path
def count_csv_rows(csv_path):
    """Return the number of data (non-header) rows in *csv_path*."""
    with open(csv_path, 'r', encoding='utf-8') as handle:
        return sum(1 for _ in csv.DictReader(handle))
def run_bicorder_analyze(input_csv, output_csv, bicorder_path, analyst=None, standpoint=None):
    """Invoke bicorder_analyze.py as a subprocess to build the output CSV.

    Returns True on success, False (after printing stderr) on failure.
    """
    script = Path(__file__).parent / 'bicorder_analyze.py'
    cmd = ['python3', str(script), input_csv, '-o', output_csv, '-b', bicorder_path]
    # Optional metadata flags are only forwarded when truthy.
    for flag, value in (('-a', analyst), ('-s', standpoint)):
        if value:
            cmd.extend([flag, value])
    print(f"Creating analysis CSV: {output_csv}")
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        print(f"Error creating CSV: {result.stderr}", file=sys.stderr)
        return False
    print(result.stdout)
    return True
def query_gradients(output_csv, row_num, bicorder_path, model=None):
    """Run bicorder_query.py for one row, streaming its output for progress.

    Returns True on success, False on a non-zero exit status.
    """
    script = Path(__file__).parent / 'bicorder_query.py'
    cmd = ['python3', str(script), output_csv, str(row_num), '-b', bicorder_path]
    if model:
        cmd.extend(['-m', model])
    print(f"Starting gradient queries...")
    # Output is deliberately NOT captured so per-gradient progress prints
    # in real time.
    if subprocess.run(cmd).returncode != 0:
        print(f"Error querying gradients", file=sys.stderr)
        return False
    return True
def process_protocol_row(input_csv, output_csv, row_num, total_rows, bicorder_path, model=None):
    """Run the complete query workflow for a single protocol row."""
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Row {row_num}/{total_rows}")
    print(f"{banner}")
    # Each gradient query runs in a fresh chat via bicorder_query.py.
    if not query_gradients(output_csv, row_num, bicorder_path, model):
        print(f"[FAILED] Could not query gradients")
        return False
    print(f"✓ Row {row_num} complete")
    return True
def main():
    """CLI entry point for batch Bicorder analysis."""
    arg_parser = argparse.ArgumentParser(
        description='Batch process protocols through Bicorder analysis (each gradient uses a new chat)',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
# Process all protocols
python3 bicorder_batch.py data/readings/synthetic_20251116/protocols_edited.csv -o data/readings/synthetic_20251116/readings.csv
# Process specific rows
python3 bicorder_batch.py data/readings/synthetic_20251116/protocols_edited.csv -o data/readings/synthetic_20251116/readings.csv --start 1 --end 5
# With specific model
python3 bicorder_batch.py data/readings/synthetic_20251116/protocols_edited.csv -o data/readings/synthetic_20251116/readings.csv -m mistral
# With metadata
python3 bicorder_batch.py data/readings/synthetic_20251116/protocols_edited.csv -o data/readings/synthetic_20251116/readings.csv -a "Your Name" -s "Your standpoint"
"""
    )
    arg_parser.add_argument('input_csv', help='Input CSV file with protocol data')
    arg_parser.add_argument('-o', '--output', required=True, help='Output CSV file')
    arg_parser.add_argument('-b', '--bicorder',
                            default='../bicorder.json',
                            help='Path to bicorder.json (default: ../bicorder.json)')
    arg_parser.add_argument('-m', '--model', help='LLM model to use')
    arg_parser.add_argument('-a', '--analyst', help='Analyst name')
    arg_parser.add_argument('-s', '--standpoint', help='Analyst standpoint')
    arg_parser.add_argument('--start', type=int, default=1,
                            help='Start row number (1-indexed, default: 1)')
    arg_parser.add_argument('--end', type=int,
                            help='End row number (1-indexed, default: all rows)')
    arg_parser.add_argument('--resume', action='store_true',
                            help='Resume from existing output CSV (skip rows with values)')
    args = arg_parser.parse_args()
    # Bail out early if either required file is missing.
    for path, label in ((args.input_csv, 'Input file'), (args.bicorder, 'Bicorder config')):
        if not Path(path).exists():
            print(f"Error: {label} '{path}' not found", file=sys.stderr)
            sys.exit(1)
    # Resolve the row range against the actual CSV size.
    total_rows = count_csv_rows(args.input_csv)
    end_row = args.end if args.end else total_rows
    if args.start > total_rows or end_row > total_rows:
        print(f"Error: Row range exceeds CSV size ({total_rows} rows)", file=sys.stderr)
        sys.exit(1)
    print(f"Bicorder Batch Analysis")
    print(f"Input: {args.input_csv} ({total_rows} protocols)")
    print(f"Output: {args.output}")
    print(f"Processing rows: {args.start} to {end_row}")
    if args.model:
        print(f"Model: {args.model}")
    print()
    # Step 1: build the output CSV, unless resuming over an existing one.
    if not args.resume or not Path(args.output).exists():
        if not run_bicorder_analyze(args.input_csv, args.output, args.bicorder,
                                    args.analyst, args.standpoint):
            sys.exit(1)
    else:
        print(f"Resuming from existing CSV: {args.output}")
    # Step 2: query gradients row by row, tallying outcomes.
    ok = 0
    failed = 0
    for row_num in range(args.start, end_row + 1):
        if process_protocol_row(args.input_csv, args.output, row_num, end_row,
                                args.bicorder, args.model):
            ok += 1
        else:
            failed += 1
            print(f"[WARNING] Row {row_num} failed, continuing...")
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"BATCH COMPLETE")
    print(f"{banner}")
    print(f"Successful: {ok}")
    print(f"Failed: {failed}")
    print(f"Output: {args.output}")
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,364 @@
#!/usr/bin/env python3
"""
Bicorder Cluster Classifier
Provides real-time protocol classification and smart form recommendation
based on the two-cluster analysis.
Usage:
from bicorder_classifier import BicorderClassifier
classifier = BicorderClassifier()
# As user fills in dimensions
ratings = {
'Design_explicit_vs_implicit': 7,
'Design_elite_vs_vernacular': 2,
# ... etc
}
result = classifier.predict(ratings)
print(f"Cluster: {result['cluster']}")
print(f"Confidence: {result['confidence']:.1%}")
print(f"Recommend form: {result['recommended_form']}")
"""
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import json
from pathlib import Path
# Path to bicorder.json (relative to this script)
_BICORDER_JSON = Path(__file__).parent.parent.parent / 'bicorder.json'
# Historical column renames: maps old CSV column names → current bicorder.json names.
# Add an entry here whenever gradient terms are renamed in bicorder.json.
_COLUMN_RENAMES = {
'Design_elite_vs_vernacular': 'Design_institutional_vs_vernacular',
'Entanglement_exclusive_vs_non-exclusive': 'Entanglement_monopolistic_vs_pluralistic',
'Experience_sufficient_vs_insufficient': 'Experience_sufficient_vs_limited',
'Experience_Kafka_vs_Whitehead': 'Experience_restraining_vs_liberating',
}
def _load_bicorder_dimensions(bicorder_path=_BICORDER_JSON):
"""Read DIMENSIONS and KEY_DIMENSIONS from bicorder.json."""
with open(bicorder_path) as f:
data = json.load(f)
dimensions = []
key_dimensions = []
for category in data['diagnostic']:
set_name = category['set_name']
for gradient in category['gradients']:
dim_name = f"{set_name}_{gradient['term_left']}_vs_{gradient['term_right']}"
dimensions.append(dim_name)
if gradient.get('shortform', False):
key_dimensions.append(dim_name)
return dimensions, key_dimensions
class BicorderClassifier:
    """
    Classifies protocols into one of two families and recommends form type.

    On construction it re-fits a StandardScaler + 1-component LDA from the
    training readings CSV and the pre-computed k-means cluster assignments
    (``kmeans_clusters.csv``), then classifies partial or complete rating
    dicts via :meth:`predict`.
    """
    # Cluster names — keys must match the 'cluster' labels in kmeans_clusters.csv
    CLUSTER_NAMES = {
        1: "Relational/Cultural",
        2: "Institutional/Bureaucratic"
    }
    def __init__(self, diagnostic_csv='data/readings/synthetic_20251116/readings.csv',
                 model_path=None):
        """Initialize classifier with pre-computed model data.

        Args:
            diagnostic_csv: Training readings CSV (one row per protocol).
            model_path: Directory holding kmeans_clusters.csv; defaults to
                ``<dataset>/analysis/data`` next to the readings CSV.
        """
        if model_path is None:
            model_path = str(Path(diagnostic_csv).parent / 'analysis' / 'data')
        self._diagnostic_csv = diagnostic_csv
        self.model_path = Path(model_path)
        self.scaler = StandardScaler()
        self.lda = None  # fitted in _load_model
        self.cluster_centroids = None  # dict: cluster id -> centroid (scaled space)
        # Derive dimension lists from bicorder.json
        self.DIMENSIONS, self.KEY_DIMENSIONS = _load_bicorder_dimensions()
        # Load training data to fit scaler and LDA
        self._load_model()
    def _load_model(self):
        """Load and fit the classification model from analysis results."""
        # Load the original data and cluster assignments
        df = pd.read_csv(self._diagnostic_csv)
        clusters = pd.read_csv(self.model_path / 'kmeans_clusters.csv')
        # Rename old column names to match current bicorder.json
        df = df.rename(columns=_COLUMN_RENAMES)
        # Remove duplicates (keep the first reading per protocol descriptor)
        df = df.drop_duplicates(subset='Descriptor', keep='first')
        # Merge and clean: only rows with a cluster label and no missing dimensions
        merged = df.merge(clusters, on='Descriptor')
        merged_clean = merged.dropna(subset=self.DIMENSIONS)
        # Prepare training data
        X = merged_clean[self.DIMENSIONS].values
        y = merged_clean['cluster'].values
        # Fit scaler
        self.scaler.fit(X)
        X_scaled = self.scaler.transform(X)
        # Fit LDA (single discriminant axis separating the two clusters)
        self.lda = LinearDiscriminantAnalysis(n_components=1)
        self.lda.fit(X_scaled, y)
        # Calculate cluster centroids in scaled space
        self.cluster_centroids = {}
        for cluster_id in [1, 2]:
            cluster_data = X_scaled[y == cluster_id]
            self.cluster_centroids[cluster_id] = cluster_data.mean(axis=0)
    def predict(self, ratings, return_details=True):
        """
        Predict cluster for given ratings.
        Args:
            ratings: Dict mapping dimension names to values (1-9)
                Can be partial - missing dimensions are filled with median
            return_details: If True, returns detailed information
        Returns:
            Dict with:
            - cluster: Predicted cluster number (1 or 2)
            - cluster_name: Human-readable cluster name
            - confidence: Confidence score (0-1)
            - completeness: Fraction of dimensions provided (0-1)
            - recommended_form: 'short' or 'long'
            - distance_to_boundary: How far from cluster boundary
            - lda_score: Score on the discriminant axis
        """
        # Convert ratings to full vector (NaN marks dimensions not provided)
        X = np.full(len(self.DIMENSIONS), np.nan)
        provided_count = 0
        for i, dim in enumerate(self.DIMENSIONS):
            if dim in ratings:
                X[i] = ratings[dim]
                provided_count += 1
        completeness = provided_count / len(self.DIMENSIONS)
        # Fill missing values with median (5 - middle of 1-9 scale)
        X[np.isnan(X)] = 5.0
        # Scale with the training-set scaler
        X_scaled = self.scaler.transform(X.reshape(1, -1))
        # Predict cluster
        cluster = self.lda.predict(X_scaled)[0]
        # Get LDA score (position on discriminant axis)
        lda_score = self.lda.decision_function(X_scaled)[0]
        # Calculate confidence based on distance from decision boundary
        # LDA decision boundary is at 0
        distance_to_boundary = abs(lda_score)
        # Confidence: higher when further from boundary
        # Normalize based on observed data range
        confidence = min(1.0, distance_to_boundary / 3.0)  # 3.0 is typical strong separation
        # Adjust confidence based on completeness (half-weight when no data given)
        adjusted_confidence = confidence * (0.5 + 0.5 * completeness)
        # Recommend form
        # Use long form when:
        # 1. Low confidence (< 0.6)
        # 2. Low completeness (< 0.5 of dimensions provided)
        # 3. Near boundary (< 0.5 distance)
        if adjusted_confidence < 0.6 or completeness < 0.5 or distance_to_boundary < 0.5:
            recommended_form = 'long'
        else:
            recommended_form = 'short'
        if not return_details:
            return {
                'cluster': int(cluster),
                'cluster_name': self.CLUSTER_NAMES[cluster],
                'confidence': float(adjusted_confidence),
                'recommended_form': recommended_form
            }
        # Calculate distances to each centroid (Euclidean, in scaled space)
        distances = {}
        for cluster_id, centroid in self.cluster_centroids.items():
            dist = np.linalg.norm(X_scaled - centroid)
            distances[cluster_id] = float(dist)
        return {
            'cluster': int(cluster),
            'cluster_name': self.CLUSTER_NAMES[cluster],
            'confidence': float(adjusted_confidence),
            'completeness': float(completeness),
            'dimensions_provided': provided_count,
            'dimensions_total': len(self.DIMENSIONS),
            'recommended_form': recommended_form,
            'distance_to_boundary': float(distance_to_boundary),
            'lda_score': float(lda_score),
            'distances_to_centroids': distances,
            'key_dimensions_provided': sum(1 for dim in self.KEY_DIMENSIONS if dim in ratings),
            'key_dimensions_total': len(self.KEY_DIMENSIONS),
        }
    def get_key_dimensions(self):
        """Return the most important dimensions for classification (a copy)."""
        return self.KEY_DIMENSIONS.copy()
    def get_short_form_dimensions(self):
        """Return recommended dimensions for short form."""
        # NOTE(review): unlike get_key_dimensions this returns the live list,
        # not a copy — callers should not mutate it.
        return self.KEY_DIMENSIONS
    def explain_classification(self, ratings):
        """
        Provide human-readable explanation of classification.
        Args:
            ratings: Dict mapping dimension names to values
        Returns:
            String explanation
        """
        result = self.predict(ratings, return_details=True)
        explanation = []
        explanation.append(f"Protocol Classification: {result['cluster_name']}")
        explanation.append(f"Confidence: {result['confidence']:.0%}")
        explanation.append(f"")
        # Positive LDA scores correspond to cluster 2 (Institutional) here.
        if result['lda_score'] > 0:
            explanation.append(f"This protocol leans toward Institutional/Bureaucratic characteristics:")
            explanation.append(f"  - More likely to be formal, standardized, top-down")
            explanation.append(f"  - May involve state/corporate enforcement")
            explanation.append(f"  - Tends toward precise, documented procedures")
        else:
            explanation.append(f"This protocol leans toward Relational/Cultural characteristics:")
            explanation.append(f"  - More likely to be emergent, community-based")
            explanation.append(f"  - May involve voluntary participation")
            explanation.append(f"  - Tends toward interpretive, flexible practices")
        explanation.append(f"")
        explanation.append(f"Distance from boundary: {result['distance_to_boundary']:.2f}")
        if result['distance_to_boundary'] < 0.5:
            explanation.append(f"⚠️  This protocol is near the boundary between families.")
            explanation.append(f"   It may exhibit characteristics of both types.")
        explanation.append(f"")
        explanation.append(f"Completeness: {result['completeness']:.0%} ({result['dimensions_provided']}/{result['dimensions_total']} dimensions)")
        if result['completeness'] < 1.0:
            explanation.append(f"Note: Missing dimensions filled with neutral values (5)")
            explanation.append(f"   Confidence improves with complete data")
        explanation.append(f"")
        explanation.append(f"Recommended form: {result['recommended_form'].upper()}")
        if result['recommended_form'] == 'long':
            explanation.append(f"Reason: Use long form for:")
            if result['confidence'] < 0.6:
                explanation.append(f"  - Low classification confidence")
            if result['completeness'] < 0.5:
                explanation.append(f"  - Incomplete data")
            if result['distance_to_boundary'] < 0.5:
                explanation.append(f"  - Ambiguous positioning between families")
        else:
            explanation.append(f"Reason: High confidence classification with {result['completeness']:.0%} data")
        return "\n".join(explanation)
    def save_model(self, output_path='bicorder_classifier_model.json'):
        """Save model parameters for use without scikit-learn.

        Serializes the scaler statistics, LDA coefficients and centroids to
        JSON so a lightweight runtime (e.g. the web UI) can reproduce
        predictions. Returns the output path.
        """
        model_data = {
            'dimensions': self.DIMENSIONS,
            'key_dimensions': self.KEY_DIMENSIONS,
            'cluster_names': self.CLUSTER_NAMES,
            'scaler_mean': self.scaler.mean_.tolist(),
            'scaler_std': self.scaler.scale_.tolist(),
            'lda_coef': self.lda.coef_.tolist(),
            'lda_intercept': self.lda.intercept_.tolist(),
            'cluster_centroids': {
                str(k): v.tolist() for k, v in self.cluster_centroids.items()
            }
        }
        with open(output_path, 'w') as f:
            json.dump(model_data, f, indent=2)
        print(f"Model saved to {output_path}")
        return output_path
def main():
    """Demo usage of the classifier."""
    rule = "=" * 80
    print(rule)
    print("BICORDER CLUSTER CLASSIFIER - DEMO")
    print(rule)
    classifier = BicorderClassifier()
    # Example 1: Relational/Cultural protocol (e.g., Indigenous knowledge sharing)
    print("\nExample 1: Community-Based Protocol")
    print("-" * 80)
    community_ratings = {
        'Design_elite_vs_vernacular': 9,        # Very vernacular
        'Design_explicit_vs_implicit': 8,       # More implicit
        'Entanglement_flocking_vs_swarming': 9, # Swarming
        'Entanglement_obligatory_vs_voluntary': 9,  # Voluntary
        'Design_static_vs_malleable': 8,        # Malleable
        'Design_technical_vs_social': 9,        # Social
    }
    print(classifier.explain_classification(community_ratings))
    # Example 2: Institutional protocol (e.g., Airport security)
    print("\n\n" + rule)
    print("Example 2: Institutional Protocol")
    print("-" * 80)
    institutional_ratings = {
        'Design_elite_vs_vernacular': 1,        # Elite
        'Design_explicit_vs_implicit': 1,       # Very explicit
        'Entanglement_flocking_vs_swarming': 1, # Flocking
        'Entanglement_obligatory_vs_voluntary': 1,  # Obligatory
        'Design_static_vs_malleable': 2,        # Static
        'Design_technical_vs_social': 2,        # Technical
        'Entanglement_sovereign_vs_subsidiary': 1,  # Sovereign
    }
    print(classifier.explain_classification(institutional_ratings))
    # Example 3: Ambiguous/boundary protocol
    print("\n\n" + rule)
    print("Example 3: Boundary Protocol (mixed characteristics)")
    print("-" * 80)
    boundary_ratings = {
        'Design_elite_vs_vernacular': 5,        # Middle
        'Design_explicit_vs_implicit': 4,       # Slightly implicit
        'Entanglement_flocking_vs_swarming': 5, # Middle
        'Entanglement_obligatory_vs_voluntary': 6,  # Slightly voluntary
    }
    print(classifier.explain_classification(boundary_ratings))
    # Persist the fitted model for sklearn-free consumers
    print("\n\n" + rule)
    classifier.save_model()
    print("\nKey dimensions for short form:")
    for dim in classifier.get_key_dimensions():
        print(f"  - {dim}")
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,95 @@
#!/usr/bin/env python3
"""
Initialize LLM conversation with bicorder framework and protocol context.
This script reads a protocol from the CSV and the bicorder.json framework,
then generates a prompt to initialize the LLM conversation.
"""
import csv
import json
import sys
import argparse
from pathlib import Path
def load_bicorder_config(bicorder_path):
    """Parse the bicorder.json configuration file into a dict."""
    with open(bicorder_path, 'r') as fh:
        return json.load(fh)
def get_protocol_by_row(csv_path, row_number):
    """
    Return {'descriptor', 'description'} for a 1-indexed data row of the CSV,
    or None if the row does not exist.
    """
    with open(csv_path, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for i, row in enumerate(reader, start=1):
            if i == row_number:
                # Short rows yield None for missing cells (DictReader's
                # restval); coalesce to '' so .strip() cannot crash.
                return {
                    'descriptor': (row.get('Descriptor') or '').strip(),
                    'description': (row.get('Description') or '').strip()
                }
    return None
def generate_init_prompt(protocol, bicorder_data):
    """Build the ultra-minimal LLM initialization prompt for *protocol*.

    *bicorder_data* is accepted for interface parity; the minimal prompt
    does not embed the framework itself.
    """
    descriptor = protocol['descriptor']
    description = protocol['description']
    return (
        f'Analyze this protocol: "{descriptor}"\n'
        f'Description: {description}\n'
        'Task: Rate this protocol on diagnostic gradients using scale 1-9 '
        '(1=left term, 5=neutral/balanced, 9=right term). Respond with just '
        'the number and brief explanation.'
    )
def main():
    """CLI entry point: print the initialization prompt for one protocol row."""
    arg_parser = argparse.ArgumentParser(
        description='Initialize LLM conversation with protocol and bicorder framework',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
# Initialize conversation for protocol in row 1
python3 bicorder_init.py protocols_edited.csv 1 | llm -m mistral --save init_1
# Initialize for row 5
python3 bicorder_init.py protocols_edited.csv 5 | llm -m mistral --save init_5
"""
    )
    arg_parser.add_argument('input_csv', help='Input CSV file with protocol data')
    arg_parser.add_argument('row_number', type=int, help='Row number to analyze (1-indexed)')
    arg_parser.add_argument('-b', '--bicorder',
                            default='../bicorder.json',
                            help='Path to bicorder.json (default: ../bicorder.json)')
    args = arg_parser.parse_args()
    # Bail out early if either required file is missing.
    for path, label in ((args.input_csv, 'Input file'), (args.bicorder, 'Bicorder config')):
        if not Path(path).exists():
            print(f"Error: {label} '{path}' not found", file=sys.stderr)
            sys.exit(1)
    protocol = get_protocol_by_row(args.input_csv, args.row_number)
    if protocol is None:
        print(f"Error: Row {args.row_number} not found in CSV", file=sys.stderr)
        sys.exit(1)
    bicorder_data = load_bicorder_config(args.bicorder)
    # The prompt goes to stdout so it can be piped straight into `llm`.
    print(generate_init_prompt(protocol, bicorder_data))
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,230 @@
#!/usr/bin/env python3
"""
Query LLM for individual gradient values and update CSV.
This script generates prompts for each gradient, queries the LLM conversation,
and updates the CSV with the returned values.
"""
import csv
import json
import sys
import argparse
import subprocess
import re
from pathlib import Path
def load_bicorder_config(bicorder_path):
    """Load bicorder.json from *bicorder_path* and return the parsed dict."""
    with open(bicorder_path, 'r') as config_fh:
        return json.load(config_fh)
def extract_gradients(bicorder_data):
    """Flatten the diagnostic sets into one list of gradient dicts,
    each with a unique ``<set>_<left>_vs_<right>`` column name."""
    flattened = []
    for group in bicorder_data['diagnostic']:
        name = group['set_name']
        for g in group['gradients']:
            entry = {
                'column_name': f"{name}_{g['term_left']}_vs_{g['term_right']}",
                'set_name': name,
                'term_left': g['term_left'],
                'term_left_description': g['term_left_description'],
                'term_right': g['term_right'],
                'term_right_description': g['term_right_description'],
            }
            flattened.append(entry)
    return flattened
def get_protocol_by_row(csv_path, row_number):
    """
    Return {'descriptor', 'description'} for a 1-indexed data row of the CSV,
    or None if the row does not exist.
    """
    with open(csv_path, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for i, row in enumerate(reader, start=1):
            if i == row_number:
                # Short rows yield None for missing cells (DictReader's
                # restval); coalesce to '' so .strip() cannot crash.
                return {
                    'descriptor': (row.get('Descriptor') or '').strip(),
                    'description': (row.get('Description') or '').strip()
                }
    return None
def generate_gradient_prompt(protocol_descriptor, protocol_description, gradient):
    """Compose the single-gradient evaluation prompt sent to the LLM.

    The prompt restates the protocol, both gradient poles with their
    descriptions, and the 1-9 rating instructions; it ends with a newline.
    """
    left = gradient['term_left']
    right = gradient['term_right']
    lines = [
        f'Analyze this protocol: "{protocol_descriptor}"',
        f'Description: {protocol_description}',
        'Evaluate the protocol on this gradient:',
        f'**{left}** (1) vs **{right}** (9)',
        f"- **{left}**: {gradient['term_left_description']}",
        f"- **{right}**: {gradient['term_right_description']}",
        'Provide a rating from 1 to 9, where:',
        f'- 1 = strongly {left}',
        '- 5 = neutral/balanced/not applicable',
        f'- 9 = strongly {right}',
        'Respond with ONLY the number (1-9), optionally followed by a brief explanation.',
        '',
    ]
    return '\n'.join(lines)
def query_llm(prompt, model=None):
    """
    Send *prompt* to the `llm` CLI (fresh chat) and return its stdout,
    stripped. Returns None if the call fails or the CLI is not installed.
    """
    cmd = ['llm']
    if model:
        cmd.extend(['-m', model])
    try:
        result = subprocess.run(
            cmd,
            input=prompt,
            text=True,
            capture_output=True,
            check=True
        )
        return result.stdout.strip()
    except subprocess.CalledProcessError as e:
        print(f" Error calling llm: {e.stderr}", file=sys.stderr)
        return None
    except FileNotFoundError:
        # `llm` is not on PATH; fail gracefully like other call errors
        # instead of crashing the whole batch with an unhandled exception.
        print(" Error calling llm: command not found", file=sys.stderr)
        return None
def extract_value(llm_response):
    """
    Extract a 1-9 rating from an LLM response, or None if no rating found.

    Prefers a digit at the very start (the requested response format) but
    falls back to the first standalone digit 1-9 anywhere in the text, since
    models often preface the number ("Rating: 7", "**7** - quite...").
    """
    text = llm_response.strip()
    # Requested format: the response begins with the number
    match = re.match(r'(\d)', text)
    if match is None:
        # Fallback: first standalone non-zero digit anywhere
        match = re.search(r'\b([1-9])\b', text)
    if match:
        value = int(match.group(1))
        if 1 <= value <= 9:
            return value
    return None
def update_csv_cell(csv_path, row_number, column_name, value):
    """
    Update one cell (1-indexed data row, named column) of a CSV file.

    The whole file is read and rewritten, since csv offers no in-place
    editing. Returns True on success, False when row_number is out of range.
    """
    with open(csv_path, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        fieldnames = reader.fieldnames
        rows = list(reader)
    # Reject non-positive row numbers explicitly: rows[row_number - 1] with
    # row_number <= 0 would silently update the wrong row via negative
    # indexing (e.g. row 0 would clobber the LAST row).
    if not 1 <= row_number <= len(rows):
        return False
    rows[row_number - 1][column_name] = str(value)
    with open(csv_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
    return True
def main():
    """CLI entry point: query every gradient for one row and update the CSV."""
    arg_parser = argparse.ArgumentParser(
        description='Query LLM for gradient values and update CSV',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
# Query all gradients for protocol in row 1
python3 bicorder_query.py analysis_output.csv 1
# Query specific model
python3 bicorder_query.py analysis_output.csv 1 -m mistral
# Dry run (show prompts without calling LLM)
python3 bicorder_query.py analysis_output.csv 1 --dry-run
"""
    )
    arg_parser.add_argument('csv_path', help='CSV file to update')
    arg_parser.add_argument('row_number', type=int, help='Row number to analyze (1-indexed)')
    arg_parser.add_argument('-b', '--bicorder',
                            default='../bicorder.json',
                            help='Path to bicorder.json (default: ../bicorder.json)')
    arg_parser.add_argument('-m', '--model', help='LLM model to use')
    arg_parser.add_argument('--dry-run', action='store_true',
                            help='Show prompts without calling LLM or updating CSV')
    args = arg_parser.parse_args()
    # Bail out early if either required file is missing.
    for path, label in ((args.csv_path, 'CSV file'), (args.bicorder, 'Bicorder config')):
        if not Path(path).exists():
            print(f"Error: {label} '{path}' not found", file=sys.stderr)
            sys.exit(1)
    protocol = get_protocol_by_row(args.csv_path, args.row_number)
    if protocol is None:
        print(f"Error: Row {args.row_number} not found in CSV", file=sys.stderr)
        sys.exit(1)
    bicorder_data = load_bicorder_config(args.bicorder)
    gradients = extract_gradients(bicorder_data)
    total = len(gradients)
    if args.dry_run:
        print(f"DRY RUN: Row {args.row_number}, {total} gradients")
        print(f"Protocol: {protocol['descriptor']}\n")
    else:
        print(f"Protocol: {protocol['descriptor']}")
        print(f"Loaded {total} gradients, starting queries...")
    # One fresh LLM chat per gradient; the CSV is rewritten after each answer
    # so partial progress survives an interrupted run.
    for idx, gradient in enumerate(gradients, 1):
        label = gradient['column_name'].replace('_', ' ')
        tag = f"[{idx}/{total}]"
        if not args.dry_run:
            print(f"{tag} Querying: {label}...", flush=True)
        prompt = generate_gradient_prompt(
            protocol['descriptor'],
            protocol['description'],
            gradient
        )
        if args.dry_run:
            print(f"{tag} {label}")
            print(f"Prompt:\n{prompt}\n")
            continue
        response = query_llm(prompt, args.model)
        if response is None:
            print(f"{tag} {label}: FAILED")
            continue
        value = extract_value(response)
        if value is None:
            print(f"{tag} {label}: WARNING - no valid value")
            continue
        if update_csv_cell(args.csv_path, args.row_number, gradient['column_name'], value):
            print(f"{tag} {label}: {value}")
        else:
            print(f"{tag} {label}: ERROR updating CSV")
    if not args.dry_run:
        print(f"\n✓ CSV updated: {args.csv_path}")
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,8 @@
#!/bin/bash
# Extract protocols from one or more source texts into output.csv.
#
# For each input file, asks the LLM (gemma3:12b via the `llm` CLI) to emit
# CSV rows of (descriptor, description) pairs for every protocol discussed
# in the text, appending the result to output.csv in the current directory.
#
# Usage: ./extract.sh file1.txt [file2.txt ...]
prompt="Return csv-formatted data (with no markdown wrapper) that consists of a list of protocols discussed or referred to in the attached text. Protocols are defined extremely broadly as 'patterns of interaction,' and may be of a nontechnical nature. Protocols should be as specific as possible, such as 'Sacrament of Reconciliation' rather than 'Religious Protocols.' The first column should provide a brief descriptor of the protocol, and the second column should describe it in a substantial paragraph of 3-5 sentences, encapsulated in quotation marks to avoid breaking on commas. Be sure to paraphrase rather than quoting directly from the source text."
for file in "$@"; do
    # Quote "$file" so paths containing spaces are passed as one argument.
    llm -m gemma3:12b -f "$file" "$prompt" >> output.csv
    echo "Completed $file"
done

View File

@@ -0,0 +1,102 @@
#!/usr/bin/env python3
"""
Apply the BicorderClassifier to all readings in a CSV and save results.
Uses the synthetic-trained LDA model by default. Missing dimensions are
filled with the neutral value (5), so shortform readings can still be
classified — though with lower confidence.
Usage:
python3 scripts/classify_readings.py data/readings/manual_20260320/readings.csv
python3 scripts/classify_readings.py data/readings/manual_20260320/readings.csv \\
--training data/readings/synthetic_20251116/readings.csv \\
--output data/readings/manual_20260320/analysis/classifications.csv
"""
import argparse
import csv
from pathlib import Path
import pandas as pd
from bicorder_classifier import BicorderClassifier
def main():
    """CLI entry point: classify every reading in a CSV and save the results.

    Loads the LDA-based BicorderClassifier (trained on the --training CSV),
    builds a ratings dict from whichever dimension columns each row actually
    provides, writes one classification row per reading to the output CSV,
    and prints a cluster/confidence summary.
    """
    parser = argparse.ArgumentParser(
        description='Classify all readings in a CSV using the BicorderClassifier'
    )
    parser.add_argument('input_csv', help='Readings CSV to classify')
    parser.add_argument(
        '--training',
        default='data/readings/synthetic_20251116/readings.csv',
        help='Training CSV for classifier (default: synthetic_20251116)'
    )
    parser.add_argument(
        '--output', default=None,
        help='Output CSV path (default: <dataset>/analysis/classifications.csv)'
    )
    args = parser.parse_args()
    input_path = Path(args.input_csv)
    # Default output lives in the dataset's analysis/ subdirectory.
    output_path = (
        Path(args.output) if args.output
        else input_path.parent / 'analysis' / 'classifications.csv'
    )
    output_path.parent.mkdir(parents=True, exist_ok=True)
    print(f"Loading classifier (training: {args.training})...")
    classifier = BicorderClassifier(diagnostic_csv=args.training)
    df = pd.read_csv(input_path)
    print(f"Classifying {len(df)} readings from {input_path}...")
    rows = []
    for _, record in df.iterrows():
        # Build ratings dict from dimension columns only; NaN cells are
        # omitted so shortform readings pass only the dimensions they have.
        ratings = {
            col: float(record[col])
            for col in classifier.DIMENSIONS
            if col in record and pd.notna(record[col])
        }
        result = classifier.predict(ratings, return_details=True)
        rows.append({
            'Descriptor': record.get('Descriptor', ''),
            'analyst': record.get('analyst', ''),
            'standpoint': record.get('standpoint', ''),
            'shortform': record.get('shortform', ''),
            'cluster': result['cluster'],
            'cluster_name': result['cluster_name'],
            'confidence': round(result['confidence'], 3),
            'lda_score': round(result['lda_score'], 3),
            'distance_to_boundary': round(result['distance_to_boundary'], 3),
            'completeness': round(result['completeness'], 3),
            'dimensions_provided': result['dimensions_provided'],
            'key_dims_provided': result['key_dimensions_provided'],
            'recommended_form': result['recommended_form'],
        })
    out_df = pd.DataFrame(rows)
    out_df.to_csv(output_path, index=False)
    print(f"Classifications saved → {output_path}")
    # Summary
    counts = out_df['cluster_name'].value_counts()
    print(f"\nCluster summary:")
    for name, count in counts.items():
        pct = count / len(out_df) * 100
        print(f" {name}: {count} ({pct:.0f}%)")
    # NOTE(review): 0.4 here is stricter than the model's exported
    # confidence_low threshold of 0.6 — confirm which cutoff is intended.
    low_conf = (out_df['confidence'] < 0.4).sum()
    if low_conf:
        print(f"\n {low_conf} readings with low confidence (<0.4) — may be boundary cases")
    # The shortform column holds stringified booleans ('True'/'False'/'').
    shortform_count = out_df[out_df['shortform'].astype(str) == 'True'].shape[0]
    if shortform_count:
        print(f"\n {shortform_count} shortform readings classified (missing dims filled with neutral 5)")
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,186 @@
#!/usr/bin/env python3
"""
Compare multiple analysis CSV files to determine which most closely resembles a reference file.
Uses Euclidean distance, correlation, and RMSE metrics.
"""
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from pathlib import Path
def calculate_euclidean_distance(df1, df2, numeric_cols):
    """Per-row Euclidean distance between two index-aligned dataframes.

    NaN differences are ignored (they contribute zero), so rows with missing
    values in either frame still produce a finite distance.
    """
    row_distances = [
        np.sqrt(np.nansum((df1.loc[row, numeric_cols] - df2.loc[row, numeric_cols]) ** 2))
        for row in df1.index
    ]
    return np.array(row_distances)
def calculate_rmse(df1, df2, numeric_cols):
    """Root-mean-squared error over all numeric cells, ignoring NaNs."""
    squared_errors = (df1[numeric_cols] - df2[numeric_cols]).values ** 2
    return np.sqrt(np.nanmean(squared_errors))
def calculate_correlation(df1, df2, numeric_cols):
    """Pearson correlation across corresponding non-NaN numeric cells.

    Returns (corr, p_value); (nan, nan) when fewer than two valid pairs exist.
    """
    flat_a = df1[numeric_cols].values.flatten()
    flat_b = df2[numeric_cols].values.flatten()
    # Keep only positions where both frames have a value.
    valid = ~(np.isnan(flat_a) | np.isnan(flat_b))
    paired_a = flat_a[valid]
    paired_b = flat_b[valid]
    if len(paired_a) < 2:
        return np.nan, np.nan
    corr, pvalue = pearsonr(paired_a, paired_b)
    return corr, pvalue
def compare_analyses(reference_file, comparison_files):
    """Compare multiple analysis files to a reference file.

    Aligns each comparison CSV to the reference by Descriptor, computes
    per-protocol Euclidean distance, overall RMSE, and Pearson correlation
    over the Design_/Entanglement_/Experience_ dimension columns, prints the
    metrics plus summary rankings, and returns a dict of results keyed by
    comparison filename.
    """
    # Read reference file
    print(f"Reading reference file: {reference_file}")
    ref_df = pd.read_csv(reference_file, quotechar='"', escapechar='\\', engine='python')
    # Get numeric columns (all the rating dimensions)
    numeric_cols = [col for col in ref_df.columns if
                    col.startswith(('Design_', 'Entanglement_', 'Experience_'))]
    # Convert numeric columns to numeric type, coercing errors to NaN
    for col in numeric_cols:
        ref_df[col] = pd.to_numeric(ref_df[col], errors='coerce')
    print(f"\nFound {len(numeric_cols)} numeric dimensions to compare")
    print(f"Comparing {len(ref_df)} protocols\n")
    print("="*80)
    results = {}
    for comp_file in comparison_files:
        print(f"\nComparing: {Path(comp_file).name}")
        print("-"*80)
        # Read comparison file
        comp_df = pd.read_csv(comp_file, quotechar='"', escapechar='\\', engine='python')
        # Convert numeric columns to numeric type, coercing errors to NaN
        for col in numeric_cols:
            comp_df[col] = pd.to_numeric(comp_df[col], errors='coerce')
        # Ensure same protocols in same order (match by Descriptor)
        if 'Descriptor' in ref_df.columns and 'Descriptor' in comp_df.columns:
            # Use merge to ensure exact matching - only keep protocols in ref_df
            # (left join: protocols absent from comp_df become all-NaN rows)
            comp_df = pd.merge(
                ref_df[['Descriptor']],
                comp_df,
                on='Descriptor',
                how='left'
            )
        # Calculate Euclidean distances using reset indices to ensure alignment
        ref_temp = ref_df.reset_index(drop=True)
        comp_temp = comp_df.reset_index(drop=True)
        euclidean_distances = calculate_euclidean_distance(ref_temp, comp_temp, numeric_cols)
        total_euclidean = np.sum(euclidean_distances)
        avg_euclidean = np.mean(euclidean_distances)
        # Calculate RMSE
        rmse = calculate_rmse(ref_temp, comp_temp, numeric_cols)
        # Calculate correlation
        correlation, p_value = calculate_correlation(ref_temp, comp_temp, numeric_cols)
        # Store results
        results[Path(comp_file).name] = {
            'total_euclidean': total_euclidean,
            'avg_euclidean': avg_euclidean,
            'rmse': rmse,
            'correlation': correlation,
            'p_value': p_value,
            'per_protocol_distances': euclidean_distances,
            'protocols': ref_df['Descriptor'].values if 'Descriptor' in ref_df.columns else None
        }
        # Print results
        print(f" Total Euclidean Distance: {total_euclidean:.2f}")
        print(f" Average Euclidean Distance: {avg_euclidean:.2f}")
        print(f" RMSE: {rmse:.2f}")
        print(f" Pearson Correlation: {correlation:.4f} (p={p_value:.2e})")
    # Summary comparison
    print("\n" + "="*80)
    print("SUMMARY RANKING (lower distance = more similar)")
    print("="*80)
    # Sort by average Euclidean distance
    sorted_by_euclidean = sorted(results.items(), key=lambda x: x[1]['avg_euclidean'])
    print("\nBy Average Euclidean Distance:")
    for i, (name, data) in enumerate(sorted_by_euclidean, 1):
        print(f" {i}. {name:30s} - Avg Distance: {data['avg_euclidean']:.2f}")
    # Sort by correlation (higher is better)
    sorted_by_corr = sorted(results.items(), key=lambda x: x[1]['correlation'], reverse=True)
    print("\nBy Correlation (higher = more similar):")
    for i, (name, data) in enumerate(sorted_by_corr, 1):
        print(f" {i}. {name:30s} - Correlation: {data['correlation']:.4f}")
    # Sort by RMSE
    sorted_by_rmse = sorted(results.items(), key=lambda x: x[1]['rmse'])
    print("\nBy RMSE (lower = more similar):")
    for i, (name, data) in enumerate(sorted_by_rmse, 1):
        print(f" {i}. {name:30s} - RMSE: {data['rmse']:.2f}")
    # Show protocols with largest differences for the best match
    print("\n" + "="*80)
    best_match_name, best_match_data = sorted_by_euclidean[0]
    print(f"Top 10 protocols with largest differences from {best_match_name}:")
    print("="*80)
    if best_match_data['protocols'] is not None:
        distances = best_match_data['per_protocol_distances']
        protocols = best_match_data['protocols']
        # Indices of the 10 largest per-protocol distances, largest first
        top_diff_indices = np.argsort(distances)[-10:][::-1]
        for idx in top_diff_indices:
            print(f" {protocols[idx]:50s} - Distance: {distances[idx]:.2f}")
    return results
if __name__ == "__main__":
    # Reference (manually produced) readings vs. the model-generated outputs.
    reference_file = "data/readings/synthetic_20251116/readings_manual.csv"
    comparison_files = [
        "data/readings/synthetic_20251116/readings_gemma3-12b.csv",
        "data/readings/synthetic_20251116/readings_gpt-oss.csv",
        "data/readings/synthetic_20251116/readings_mistral.csv"
    ]
    # Check if files exist
    if not Path(reference_file).exists():
        print(f"Error: Reference file '{reference_file}' not found")
        exit(1)
    # Filter out missing files WITHOUT mutating the list while iterating it:
    # list.remove() during iteration skips the element after each removal,
    # so consecutive missing files could slip through unreported.
    for file in comparison_files:
        if not Path(file).exists():
            print(f"Warning: Comparison file '{file}' not found, skipping...")
    comparison_files = [f for f in comparison_files if Path(f).exists()]
    if not comparison_files:
        print("Error: No comparison files found")
        exit(1)
    # Run comparison
    results = compare_analyses(reference_file, comparison_files)
    print("\n" + "="*80)
    print("Analysis complete!")
    print("="*80)

View File

@@ -0,0 +1,189 @@
#!/usr/bin/env python3
"""
Convert diagnostic_output.csv to individual JSON files following the bicorder.json spec.
Handles mapping between old CSV column names and current spec terminology.
"""
import csv
import json
import os
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Any, Optional
import statistics
# Mapping from CSV columns to spec terms
# Format: (csv_column_suffix, set_name, term_left, term_right)
# The full CSV column name is f"{set_name}_{csv_column_suffix}". Entries with
# inline "Changed:" notes record gradient terms that were renamed in later
# bicorder.json versions, so legacy CSV columns still land on current slots.
GRADIENT_MAPPINGS = [
    # Design set
    ("explicit_vs_implicit", "Design", "explicit", "implicit"),
    ("precise_vs_interpretive", "Design", "precise", "interpretive"),
    ("elite_vs_vernacular", "Design", "institutional", "vernacular"),  # Changed: elite → institutional
    ("documenting_vs_enabling", "Design", "documenting", "enabling"),
    ("static_vs_malleable", "Design", "static", "malleable"),
    ("technical_vs_social", "Design", "technical", "social"),
    ("universal_vs_particular", "Design", "universal", "particular"),
    ("durable_vs_ephemeral", "Design", "durable", "ephemeral"),
    # Entanglement set
    ("macro_vs_micro", "Entanglement", "macro", "micro"),
    ("sovereign_vs_subsidiary", "Entanglement", "sovereign", "subsidiary"),
    ("self-enforcing_vs_enforced", "Entanglement", "self-enforcing", "enforced"),
    ("abstract_vs_embodied", "Entanglement", "abstract", "embodied"),
    ("obligatory_vs_voluntary", "Entanglement", "obligatory", "voluntary"),
    ("flocking_vs_swarming", "Entanglement", "flocking", "swarming"),
    ("defensible_vs_exposed", "Entanglement", "defensible", "exposed"),
    ("exclusive_vs_non-exclusive", "Entanglement", "monopolistic", "pluralistic"),  # Changed: exclusive → monopolistic
    # Experience set
    ("sufficient_vs_insufficient", "Experience", "sufficient", "limited"),  # Changed: insufficient → limited
    ("crystallized_vs_contested", "Experience", "crystallized", "contested"),
    ("trust-evading_vs_trust-inducing", "Experience", "trust-evading", "trust-inducing"),
    ("predictable_vs_emergent", "Experience", "predictable", "emergent"),
    ("exclusion_vs_inclusion", "Experience", "exclusion", "inclusion"),
    ("Kafka_vs_Whitehead", "Experience", "restraining", "liberating"),  # Changed: Kafka_vs_Whitehead → restraining_vs_liberating
    ("dead_vs_alive", "Experience", "dead", "alive"),
]
def load_spec_template(spec_path: str) -> Dict[str, Any]:
    """Parse bicorder.json and return it as the template dict for readings."""
    spec_text = Path(spec_path).read_text()
    return json.loads(spec_text)
def calculate_hardness(gradient_values: List[Optional[int]]) -> Optional[int]:
    """Mean of the non-null gradient values, rounded to the nearest integer.

    Returns None when no gradient has a value.
    """
    present = [value for value in gradient_values if value is not None]
    if not present:
        return None
    return round(sum(present) / len(present))
def calculate_polarization(gradient_values: List[Optional[int]]) -> Optional[int]:
    """Score how extreme versus centered the valid gradient values are.

    All-extreme readings (every value 1 or 9) score 1; perfectly centered
    readings (all 5) score 9. Returns None when no gradient has a value.
    """
    present = [v for v in gradient_values if v is not None]
    if not present:
        return None
    # Mean distance from the neutral midpoint (5); the maximum possible is 4.
    mean_offset = sum(abs(v - 5) for v in present) / len(present)
    # Map the [0, 4] offset range linearly onto a [9, 1] score.
    return round(9 - (mean_offset / 4) * 8)
def create_json_from_row(row: Dict[str, str], template: Dict[str, Any]) -> Dict[str, Any]:
    """Convert a CSV row to a JSON object following the spec.

    Deep-copies the template, fills metadata from the row, writes each
    gradient value into the slot matching (set_name, term_left, term_right)
    via GRADIENT_MAPPINGS, and computes the hardness and polarization
    analysis fields from the values that were present.
    """
    result = json.loads(json.dumps(template))  # Deep copy
    # Update metadata
    result["metadata"]["protocol"] = row["Descriptor"]
    result["metadata"]["description"] = row["Description"]
    result["metadata"]["analyst"] = row["analyst"]
    result["metadata"]["standpoint"] = row["standpoint"]
    result["metadata"]["timestamp"] = None  # Not in CSV
    # Collect gradient values for analysis calculations
    gradient_values = []
    # Map CSV values to gradient objects
    for csv_suffix, set_name, term_left, term_right in GRADIENT_MAPPINGS:
        csv_column = f"{set_name}_{csv_suffix}"
        # Get the value from CSV (may be empty string); empty cells become None
        csv_value = row.get(csv_column, "").strip()
        value = int(csv_value) if csv_value else None
        if value is not None:
            gradient_values.append(value)
        # Find the corresponding gradient in the template
        for diagnostic_set in result["diagnostic"]:
            if diagnostic_set["set_name"] == set_name:
                for gradient in diagnostic_set["gradients"]:
                    if gradient["term_left"] == term_left and gradient["term_right"] == term_right:
                        gradient["value"] = value
                        break
    # Calculate automated analysis fields
    result["analysis"][0]["value"] = calculate_hardness(gradient_values)  # hardness
    result["analysis"][1]["value"] = calculate_polarization(gradient_values)  # polarized
    # analysis[2] is bureaucratic (LDA-based) - leave as null
    # analysis[3] is usefulness - leave as null (not automated)
    return result
def main():
    """Convert each row of a diagnostic readings CSV into a JSON reading file.

    Loads bicorder.json as a template, fills one copy per CSV row via
    create_json_from_row(), and writes a numbered JSON file per protocol
    into the output directory.
    """
    import argparse
    parser = argparse.ArgumentParser(
        description='Convert diagnostic readings CSV to individual JSON files',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
  python3 scripts/convert_csv_to_json.py data/readings/synthetic_20251116/readings.csv
  python3 scripts/convert_csv_to_json.py data/readings/manual_20260101/readings.csv --output-dir data/readings/manual_20260101/json
"""
    )
    parser.add_argument('input_csv', help='Diagnostic readings CSV (e.g. data/readings/synthetic_20251116/readings.csv)')
    parser.add_argument('--output-dir', default=None,
                        help='Output directory for JSON files (default: <dataset_dir>/json)')
    parser.add_argument('--bicorder', default='../bicorder.json',
                        help='Path to bicorder.json (default: ../bicorder.json)')
    args = parser.parse_args()
    csv_path = args.input_csv
    spec_path = args.bicorder
    output_dir = args.output_dir if args.output_dir else str(Path(args.input_csv).parent / 'json')
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    # Load template
    template = load_spec_template(spec_path)
    # Process CSV
    with open(csv_path, 'r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        count = 0
        for i, row in enumerate(reader, start=1):
            # Create JSON object
            json_obj = create_json_from_row(row, template)
            # Build a filesystem-safe, row-numbered filename.
            # Fix: the sanitized protocol name was previously discarded by a
            # second assignment to `filename`, so every row wrote to the same
            # file and overwrote the previous one.
            protocol_name = row["Descriptor"]
            safe_name = protocol_name.replace("/", "_").replace("\\", "_")
            filename = f"{i:03d}_{safe_name}.json"
            # Write to file
            output_path = os.path.join(output_dir, filename)
            with open(output_path, 'w', encoding='utf-8') as jsonfile:
                json.dump(json_obj, jsonfile, indent=2)
            count += 1
            if count % 50 == 0:
                print(f"Processed {count} protocols...")
    print(f"\nConversion complete! Created {count} JSON files in {output_dir}/")
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,172 @@
#!/usr/bin/env python3
"""
Export the cluster classification model to JSON for use in JavaScript.
Reads dimension names directly from bicorder.json so the model always
stays in sync with the current bicorder structure.
When gradients are renamed in bicorder.json, add the old→new mapping to
COLUMN_RENAMES so the training CSV columns are correctly aligned.
Usage:
python3 scripts/export_model_for_js.py data/readings/synthetic_20251116/readings.csv
python3 scripts/export_model_for_js.py data/readings/manual_20260101/readings.csv --output bicorder_model.json
"""
import argparse
import json
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Path to bicorder.json (relative to this script)
# NOTE(review): resolves three levels up from this file — i.e. it assumes the
# layout <parent>/bicorder.json with this script at <parent>/<repo>/scripts/;
# confirm against the actual checkout layout.
BICORDER_JSON = Path(__file__).parent.parent.parent / 'bicorder.json'
# Historical column renames: maps old CSV column names → current bicorder.json names.
# Add an entry here whenever gradient terms are renamed in bicorder.json.
COLUMN_RENAMES = {
    'Design_elite_vs_vernacular': 'Design_institutional_vs_vernacular',
    'Entanglement_exclusive_vs_non-exclusive': 'Entanglement_monopolistic_vs_pluralistic',
    'Experience_sufficient_vs_insufficient': 'Experience_sufficient_vs_limited',
    'Experience_Kafka_vs_Whitehead': 'Experience_restraining_vs_liberating',
}
def load_bicorder_dimensions(bicorder_path):
    """Return (dimensions, key_dimensions, version) parsed from bicorder.json.

    Dimension names follow the CSV column convention
    "<set_name>_<term_left>_vs_<term_right>"; gradients flagged with
    shortform=true are additionally collected as key dimensions.
    """
    with open(bicorder_path) as handle:
        spec = json.load(handle)
    dims = []
    key_dims = []
    for diagnostic_set in spec['diagnostic']:
        prefix = diagnostic_set['set_name']
        for grad in diagnostic_set['gradients']:
            name = f"{prefix}_{grad['term_left']}_vs_{grad['term_right']}"
            dims.append(name)
            if grad.get('shortform', False):
                key_dims.append(name)
    return dims, key_dims, spec['version']
def main():
    """Train a scaler + 1-component LDA on clustered readings and export JSON.

    The exported model bundles the dimension names (read from bicorder.json),
    StandardScaler parameters, LDA coefficients, cluster centroids/means, and
    UI thresholds so a JavaScript client can classify readings without Python.
    """
    parser = argparse.ArgumentParser(
        description='Export cluster classification model to JSON for JavaScript',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
  python3 scripts/export_model_for_js.py data/readings/synthetic_20251116/readings.csv
  python3 scripts/export_model_for_js.py data/readings/manual_20260101/readings.csv --output bicorder_model.json
"""
    )
    parser.add_argument('input_csv', help='Diagnostic readings CSV (e.g. data/readings/synthetic_20251116/readings.csv)')
    parser.add_argument('--output', default='bicorder_model.json',
                        help='Output model JSON path (default: bicorder_model.json)')
    args = parser.parse_args()
    dataset_dir = Path(args.input_csv).parent
    analysis_dir = dataset_dir / 'analysis'
    # Derive dimensions and version from bicorder.json
    DIMENSIONS, KEY_DIMENSIONS, BICORDER_VERSION = load_bicorder_dimensions(BICORDER_JSON)
    print(f"Loaded bicorder.json v{BICORDER_VERSION}")
    print(f"Dimensions: {len(DIMENSIONS)}, key dimensions: {len(KEY_DIMENSIONS)}")
    # Load data; cluster labels come from a prior multivariate analysis run
    df = pd.read_csv(args.input_csv)
    clusters = pd.read_csv(analysis_dir / 'data' / 'kmeans_clusters.csv')
    # Rename old column names to match current bicorder.json
    df = df.rename(columns=COLUMN_RENAMES)
    # Remove duplicates
    df = df.drop_duplicates(subset='Descriptor', keep='first')
    # Merge and clean (rows missing any dimension are excluded from training)
    merged = df.merge(clusters, on='Descriptor')
    merged_clean = merged.dropna(subset=DIMENSIONS)
    print(f"Training on {len(merged_clean)} protocols")
    # Prepare training data
    X = merged_clean[DIMENSIONS].values
    y = merged_clean['cluster'].values
    # Fit scaler
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    # Fit LDA
    lda = LinearDiscriminantAnalysis(n_components=1)
    lda.fit(X_scaled, y)
    # Calculate cluster centroids in scaled space
    cluster_centroids = {}
    for cluster_id in [1, 2]:
        cluster_data = X_scaled[y == cluster_id]
        cluster_centroids[cluster_id] = cluster_data.mean(axis=0).tolist()
    # Calculate cluster means in original space (for reference)
    cluster_means_original = {}
    for cluster_id in [1, 2]:
        cluster_data_original = X[y == cluster_id]
        cluster_means_original[cluster_id] = cluster_data_original.mean(axis=0).tolist()
    # Build model export
    model = {
        'version': '1.0',
        'bicorder_version': BICORDER_VERSION,
        'generated': pd.Timestamp.now().isoformat(),
        'dimensions': DIMENSIONS,
        'key_dimensions': KEY_DIMENSIONS,
        'cluster_names': {
            '1': 'Relational/Cultural',
            '2': 'Institutional/Bureaucratic'
        },
        'cluster_descriptions': {
            '1': 'Community-based, emergent, voluntary, cultural protocols',
            '2': 'Formal, institutional, top-down, bureaucratic protocols'
        },
        'scaler': {
            'mean': scaler.mean_.tolist(),
            'scale': scaler.scale_.tolist()
        },
        'lda': {
            'coefficients': lda.coef_[0].tolist(),
            'intercept': lda.intercept_[0]
        },
        'cluster_centroids_scaled': cluster_centroids,
        'cluster_means_original': cluster_means_original,
        # Thresholds consumed by the JS UI to flag uncertain classifications
        'thresholds': {
            'confidence_low': 0.6,
            'completeness_low': 0.5,
            'boundary_distance_low': 0.5
        },
        'metadata': {
            'total_protocols': len(merged_clean),
            'cluster_1_count': int((y == 1).sum()),
            'cluster_2_count': int((y == 2).sum()),
        }
    }
    # Save to JSON
    with open(args.output, 'w') as f:
        json.dump(model, f, indent=2)
    print(f"\nModel exported to {args.output}")
    print(f"Bicorder version: {BICORDER_VERSION}")
    print(f"Total dimensions: {len(DIMENSIONS)}")
    print(f"Key dimensions (short form):")
    for dim in KEY_DIMENSIONS:
        print(f"  - {dim}")
    print(f"Model size: {len(json.dumps(model))} bytes")
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,164 @@
#!/usr/bin/env python3
"""
Convert a directory of individual bicorder JSON reading files into a diagnostic CSV.
This is the reverse of convert_csv_to_json.py. Each JSON file becomes one row.
Handles readings across bicorder versions by matching on term_left/term_right pairs
rather than column names.
Null gradient values (e.g., shortform readings that skip non-key dimensions) are
written as empty cells so downstream analysis can treat them as NaN.
Usage:
python3 scripts/json_to_csv.py data/readings/manual_20260320/json/
python3 scripts/json_to_csv.py data/readings/manual_20260320/json/ -o data/readings/manual_20260320/readings.csv
python3 scripts/json_to_csv.py data/readings/manual_20260320/json/ --shortform-only
"""
import argparse
import csv
import json
from pathlib import Path
# Map old term pairs to current column names (matches COLUMN_RENAMES in other scripts).
# Keys are (term_left, term_right) as found in older JSON files.
TERM_RENAMES = {
    ('elite', 'vernacular'): ('institutional', 'vernacular'),
    ('exclusive', 'non-exclusive'): ('monopolistic', 'pluralistic'),
    ('insufficient', 'sufficient'): ('sufficient', 'limited'),  # note: order swapped in old versions
    ('Kafka', 'Whitehead'): ('restraining', 'liberating'),
}
def load_bicorder_columns(bicorder_path):
    """Return (ordered column list, set of shortform/key columns) from bicorder.json."""
    with open(bicorder_path) as handle:
        spec = json.load(handle)
    ordered = []
    shortform_cols = set()
    for diagnostic_set in spec['diagnostic']:
        prefix = diagnostic_set['set_name']
        for grad in diagnostic_set['gradients']:
            name = f"{prefix}_{grad['term_left']}_vs_{grad['term_right']}"
            ordered.append(name)
            if grad.get('shortform', False):
                shortform_cols.add(name)
    return ordered, shortform_cols
def normalize_terms(term_left, term_right):
    """Translate an old (term_left, term_right) pair to current bicorder.json terms.

    Pairs that older files recorded with swapped left/right are recognized
    too; the renamed pair is swapped back to preserve the file's orientation.
    """
    if (term_left, term_right) in TERM_RENAMES:
        return TERM_RENAMES[(term_left, term_right)]
    if (term_right, term_left) in TERM_RENAMES:
        renamed_left, renamed_right = TERM_RENAMES[(term_right, term_left)]
        return renamed_right, renamed_left  # swap back
    return term_left, term_right
def json_to_row(json_path, all_columns):
    """Flatten a single bicorder JSON reading file into a CSV row dict."""
    with open(json_path) as handle:
        reading = json.load(handle)
    meta = reading.get('metadata', {})
    row = {
        'Descriptor': meta.get('protocol', ''),
        'Description': '',  # not stored in individual reading files
        'analyst': meta.get('analyst', ''),
        'standpoint': meta.get('standpoint', ''),
        'timestamp': meta.get('timestamp', ''),
        'shortform': str(meta.get('shortform', '')),
        'version': reading.get('version', ''),
    }
    # Index gradient values by their normalized column name.
    values_by_column = {}
    for diagnostic_set in reading.get('diagnostic', []):
        prefix = diagnostic_set['set_name']
        for grad in diagnostic_set.get('gradients', []):
            left, right = normalize_terms(grad['term_left'], grad['term_right'])
            value = grad.get('value')
            # Null values (e.g. skipped dims in shortform readings) → empty cell
            values_by_column[f"{prefix}_{left}_vs_{right}"] = '' if value is None else str(value)
    row.update({column: values_by_column.get(column, '') for column in all_columns})
    return row
def main():
    """CLI entry point: convert a directory of JSON readings into one CSV.

    Column order and key-dimension flags come from bicorder.json; readings
    that fail to parse are skipped with a warning. Prints per-dimension
    coverage statistics after writing the CSV.
    """
    parser = argparse.ArgumentParser(
        description='Convert directory of bicorder JSON files to a diagnostic CSV',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
  python3 scripts/json_to_csv.py data/readings/manual_20260320/json/
  python3 scripts/json_to_csv.py data/readings/manual_20260320/json/ --shortform-only
  python3 scripts/json_to_csv.py data/readings/manual_20260320/json/ -o data/readings/manual_20260320/readings.csv
"""
    )
    parser.add_argument('json_dir', help='Directory containing bicorder JSON reading files')
    parser.add_argument('-o', '--output', default=None,
                        help='Output CSV path (default: <dataset_dir>/readings.csv)')
    parser.add_argument('-b', '--bicorder', default='../bicorder.json',
                        help='Path to bicorder.json (default: ../bicorder.json)')
    parser.add_argument('--shortform-only', action='store_true',
                        help='Include only the key shortform dimensions (useful when most readings are shortform)')
    args = parser.parse_args()
    json_dir = Path(args.json_dir)
    dataset_dir = json_dir.parent
    output_path = Path(args.output) if args.output else dataset_dir / 'readings.csv'
    all_columns, key_columns = load_bicorder_columns(args.bicorder)
    if args.shortform_only:
        columns = [c for c in all_columns if c in key_columns]
        print(f"Shortform mode: using {len(columns)} key dimensions")
    else:
        columns = all_columns
    json_files = sorted(json_dir.glob('*.json'))
    if not json_files:
        print(f"Error: no JSON files found in {json_dir}")
        return
    print(f"Converting {len(json_files)} JSON files → {output_path}")
    fieldnames = ['Descriptor', 'Description', 'analyst', 'standpoint',
                  'timestamp', 'shortform', 'version'] + columns
    rows = []
    for json_path in json_files:
        try:
            row = json_to_row(json_path, columns)
            rows.append(row)
        except Exception as e:
            # Best-effort: one malformed reading shouldn't abort the whole run.
            print(f" Warning: skipping {json_path.name}: {e}")
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
    # Summary stats: count of readings with a non-empty value per dimension
    filled = {col: sum(1 for r in rows if r.get(col)) for col in columns}
    print(f"Done. {len(rows)} rows written.")
    print(f"\nDimension coverage (readings with a value):")
    for col, count in filled.items():
        pct = count / len(rows) * 100 if rows else 0
        marker = '* ' if col in key_columns else ' '
        print(f" {marker}{col}: {count}/{len(rows)} ({pct:.0f}%)")
    print(f"\n(* = shortform/key dimension)")
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,177 @@
#!/usr/bin/env python3
"""
Create LDA visualization to maximize cluster separation.
Usage:
python3 scripts/lda_visualization.py data/readings/synthetic_20251116.csv
python3 scripts/lda_visualization.py data/readings/synthetic_20251116.csv --results-dir analysis_results/synthetic_20251116
"""
import argparse
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler
from pathlib import Path
def main():
    """Fit a 1-component LDA on clustered readings and visualize separation.

    Loads the readings CSV and prior k-means cluster assignments, drops
    sparsely covered dimension columns (to support shortform datasets),
    imputes remaining NaNs with column medians, projects protocols onto the
    discriminant axis, and saves a histogram/strip plot plus projection and
    coefficient CSVs into the analysis directory.
    """
    parser = argparse.ArgumentParser(
        description='Create LDA visualization of cluster separation',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
  python3 scripts/lda_visualization.py data/readings/synthetic_20251116/readings.csv
  python3 scripts/lda_visualization.py data/readings/manual_20260101/readings.csv --analysis-dir data/readings/manual_20260101/analysis
"""
    )
    parser.add_argument('input_csv', help='Diagnostic readings CSV (e.g. data/readings/synthetic_20251116/readings.csv)')
    parser.add_argument('--analysis-dir', default=None,
                        help='Analysis directory (default: <dataset_dir>/analysis)')
    args = parser.parse_args()
    dataset_dir = Path(args.input_csv).parent
    results_dir = Path(args.analysis_dir) if args.analysis_dir else dataset_dir / 'analysis'
    plots_dir = results_dir / 'plots'
    data_dir = results_dir / 'data'
    # Fix: ensure output directories exist before writing — previously
    # savefig/to_csv failed if the analysis subdirectories hadn't been created
    # by an earlier script. (data_dir must still contain kmeans_clusters.csv
    # from a prior clustering run for the read below to succeed.)
    plots_dir.mkdir(parents=True, exist_ok=True)
    data_dir.mkdir(parents=True, exist_ok=True)
    # Load the original data
    df = pd.read_csv(args.input_csv)
    # Identify dimension columns by their diagnostic-set prefixes
    all_cols = df.columns.tolist()
    design_cols = [c for c in all_cols if c.startswith('Design_')]
    entanglement_cols = [c for c in all_cols if c.startswith('Entanglement_')]
    experience_cols = [c for c in all_cols if c.startswith('Experience_')]
    dimension_cols = design_cols + entanglement_cols + experience_cols
    # Load cluster assignments
    clusters = pd.read_csv(data_dir / 'kmeans_clusters.csv')
    df_with_clusters = df.merge(clusters, on='Descriptor')
    # Drop dimension columns with low coverage (< 80%) to handle shortform datasets
    n = len(df_with_clusters)
    coverage = df_with_clusters[dimension_cols].notna().sum() / n
    dimension_cols = [c for c in dimension_cols if coverage[c] >= 0.8]
    # Prepare data — impute any remaining NaNs with column median
    X_df = df_with_clusters[dimension_cols].copy()
    X_df = X_df.fillna(X_df.median())
    X = X_df.values
    y = df_with_clusters['cluster'].values
    # Standardize
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    # Fit LDA (with 1 component for 2 classes)
    lda = LinearDiscriminantAnalysis(n_components=1)
    X_lda = lda.fit_transform(X_scaled, y).ravel()
    # Create histogram showing separation
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 10))
    # Histogram
    colors = {1: '#2E86AB', 2: '#A23B72'}
    for cluster_id in [1, 2]:
        cluster_data = X_lda[y == cluster_id]
        ax1.hist(cluster_data, bins=30, alpha=0.6,
                 color=colors[cluster_id],
                 label=f'Cluster {cluster_id}',
                 edgecolor='white', linewidth=0.5)
    ax1.set_xlabel('Linear Discriminant (LD1)', fontsize=12)
    ax1.set_ylabel('Frequency', fontsize=12)
    ax1.set_title('Linear Discriminant Analysis: Cluster Separation\n(Maximum separation projection)',
                  fontsize=14, fontweight='bold')
    ax1.legend(fontsize=11)
    ax1.grid(True, alpha=0.3, axis='y')
    # Strip plot - shows individual protocols
    for cluster_id in [1, 2]:
        cluster_data = X_lda[y == cluster_id]
        cluster_protocols = df_with_clusters[df_with_clusters['cluster'] == cluster_id]['Descriptor'].values
        # Add jitter for visibility
        y_jitter = np.random.normal(cluster_id, 0.1, size=len(cluster_data))
        ax2.scatter(cluster_data, y_jitter,
                    c=colors[cluster_id], alpha=0.5, s=40,
                    edgecolors='white', linewidth=0.3)
        # Label a few representative protocols (every 25th)
        for i in range(0, len(cluster_data), 25):
            ax2.annotate(cluster_protocols[i],
                         (cluster_data[i], y_jitter[i]),
                         fontsize=7, alpha=0.7,
                         xytext=(0, 5), textcoords='offset points',
                         rotation=45, ha='left')
    ax2.set_xlabel('Linear Discriminant (LD1)', fontsize=12)
    ax2.set_ylabel('Cluster', fontsize=12)
    ax2.set_yticks([1, 2])
    ax2.set_yticklabels(['Cluster 1:\nRelational/Cultural', 'Cluster 2:\nInstitutional/Bureaucratic'])
    ax2.set_title('Individual Protocols Projected onto Discriminant Axis', fontsize=12)
    ax2.grid(True, alpha=0.3, axis='x')
    plt.tight_layout()
    plt.savefig(plots_dir / 'lda_cluster_separation.png', dpi=300, bbox_inches='tight')
    print(f"Saved: {plots_dir / 'lda_cluster_separation.png'}")
    # Calculate separation metrics
    mean_1 = X_lda[y == 1].mean()
    mean_2 = X_lda[y == 2].mean()
    std_1 = X_lda[y == 1].std()
    std_2 = X_lda[y == 2].std()
    # Cohen's d (effect size)
    pooled_std = np.sqrt((std_1**2 + std_2**2) / 2)
    cohens_d = abs(mean_1 - mean_2) / pooled_std
    print(f"\n=== Cluster Separation Statistics ===")
    # Guard against 0-d ndarray results so formatting below can't fail
    mean_1_val = mean_1[0] if isinstance(mean_1, np.ndarray) else mean_1
    mean_2_val = mean_2[0] if isinstance(mean_2, np.ndarray) else mean_2
    cohens_d_val = cohens_d[0] if isinstance(cohens_d, np.ndarray) else cohens_d
    print(f"Cluster 1 mean: {mean_1_val:.3f} (std: {std_1:.3f})")
    print(f"Cluster 2 mean: {mean_2_val:.3f} (std: {std_2:.3f})")
    print(f"Distance between means: {abs(mean_1_val - mean_2_val):.3f}")
    print(f"Cohen's d (effect size): {cohens_d_val:.3f}")
    print(f"  (0.2=small, 0.5=medium, 0.8=large effect)")
    # Overlap percentage (rough estimate based on range intersection)
    overlap_start = max(X_lda[y == 1].min(), X_lda[y == 2].min())
    overlap_end = min(X_lda[y == 1].max(), X_lda[y == 2].max())
    overlap_range = overlap_end - overlap_start if overlap_end > overlap_start else 0
    total_range = X_lda.max() - X_lda.min()
    overlap_pct = (overlap_range / total_range) * 100 if overlap_range > 0 else 0
    print(f"Approximate overlap: {overlap_pct:.1f}% of total range")
    # Save LDA projection data
    lda_df = pd.DataFrame({
        'Descriptor': df_with_clusters['Descriptor'],
        'LD1': X_lda.flatten(),
        'Cluster': y
    })
    lda_df.to_csv(data_dir / 'lda_projection.csv', index=False)
    print(f"Saved: {data_dir / 'lda_projection.csv'}")
    print("\n=== Most discriminating dimensions ===")
    loadings = pd.DataFrame({
        'Dimension': dimension_cols,
        'LDA_Coefficient': lda.coef_[0]
    })
    loadings['Abs_Coefficient'] = loadings['LDA_Coefficient'].abs()
    loadings = loadings.sort_values('Abs_Coefficient', ascending=False)
    print("\nTop 10 dimensions that separate the clusters:")
    for _, row in loadings.head(10).iterrows():
        print(f"  {row['Dimension']}: {row['LDA_Coefficient']:.3f}")
    loadings.to_csv(data_dir / 'lda_coefficients.csv', index=False)
    print(f"\nSaved: {data_dir / 'lda_coefficients.csv'}")
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,858 @@
#!/usr/bin/env python3
"""
Multivariate Analysis Script for Protocol Bicorder Data
Performs comprehensive multivariate statistical analyses on protocol diagnostic data,
including clustering, dimensionality reduction, correlation analysis, and visualization.
Usage:
python3 multivariate_analysis.py diagnostic_output.csv [--analyses all]
python3 multivariate_analysis.py diagnostic_output.csv --analyses clustering pca
"""
import argparse
import sys
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from scipy.spatial.distance import pdist, squareform
import networkx as nx
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import silhouette_score, davies_bouldin_score
try:
import umap
UMAP_AVAILABLE = True
except ImportError:
UMAP_AVAILABLE = False
print("Note: UMAP not available. Install with: pip install umap-learn")
class ProtocolAnalyzer:
"""Main class for multivariate analysis of protocol data."""
def __init__(self, csv_path, output_dir='analysis_results', min_coverage=0.0):
"""Initialize analyzer with data and output directory.
Args:
csv_path: Path to diagnostic CSV file
output_dir: Directory for analysis output
min_coverage: Drop dimension columns with fewer than this fraction of
non-null values (0.01.0). Useful for sparse/shortform
datasets. E.g. 0.8 keeps only columns with ≥80% coverage.
"""
self.csv_path = Path(csv_path)
self.output_dir = Path(output_dir)
self.min_coverage = min_coverage
self.output_dir.mkdir(exist_ok=True)
# Create subdirectories
(self.output_dir / 'plots').mkdir(exist_ok=True)
(self.output_dir / 'data').mkdir(exist_ok=True)
(self.output_dir / 'reports').mkdir(exist_ok=True)
# Load and prepare data
self.df = None
self.dimension_cols = []
self.design_cols = []
self.entanglement_cols = []
self.experience_cols = []
self.scaled_data = None
self.scaler = None
self._load_data()
def _load_data(self):
"""Load CSV and identify dimension columns."""
print(f"Loading data from {self.csv_path}...")
self.df = pd.read_csv(self.csv_path)
# Identify dimension columns
all_cols = self.df.columns.tolist()
self.design_cols = [c for c in all_cols if c.startswith('Design_')]
self.entanglement_cols = [c for c in all_cols if c.startswith('Entanglement_')]
self.experience_cols = [c for c in all_cols if c.startswith('Experience_')]
self.dimension_cols = self.design_cols + self.entanglement_cols + self.experience_cols
print(f"Loaded {len(self.df)} protocols with {len(self.dimension_cols)} dimensions")
print(f" - Design: {len(self.design_cols)}")
print(f" - Entanglement: {len(self.entanglement_cols)}")
print(f" - Experience: {len(self.experience_cols)}")
# Drop low-coverage columns if min_coverage is set
if self.min_coverage > 0.0:
n = len(self.df)
coverage = self.df[self.dimension_cols].notna().sum() / n
dropped = [c for c in self.dimension_cols if coverage[c] < self.min_coverage]
if dropped:
print(f"\nDropping {len(dropped)} dimension(s) below {self.min_coverage:.0%} coverage:")
for c in dropped:
print(f" - {c}: {coverage[c]:.0%}")
self.dimension_cols = [c for c in self.dimension_cols if c not in dropped]
self.design_cols = [c for c in self.design_cols if c not in dropped]
self.entanglement_cols = [c for c in self.entanglement_cols if c not in dropped]
self.experience_cols = [c for c in self.experience_cols if c not in dropped]
print(f"Remaining dimensions: {len(self.dimension_cols)}")
# Check for missing values
missing_count = self.df[self.dimension_cols].isna().sum().sum()
rows_with_missing = self.df[self.dimension_cols].isna().any(axis=1).sum()
if missing_count > 0:
print(f"\nWarning: Found {missing_count} missing values in {rows_with_missing} rows")
print("Dropping rows with missing values...")
self.df = self.df.dropna(subset=self.dimension_cols)
print(f"Dataset now contains {len(self.df)} protocols")
# Standardize the dimension data
self.scaler = StandardScaler()
self.scaled_data = self.scaler.fit_transform(self.df[self.dimension_cols])
def save_results(self, data, filename, subdir='data'):
"""Save results to CSV file."""
output_path = self.output_dir / subdir / filename
if isinstance(data, pd.DataFrame):
data.to_csv(output_path, index=False)
else:
pd.DataFrame(data).to_csv(output_path)
print(f" Saved: {output_path}")
def save_plot(self, filename):
"""Save current matplotlib figure."""
output_path = self.output_dir / 'plots' / filename
plt.tight_layout()
plt.savefig(output_path, dpi=300, bbox_inches='tight')
print(f" Saved: {output_path}")
plt.close()
# ========== CLUSTERING ANALYSES ==========
def kmeans_clustering(self, n_clusters_range=(2, 10)):
"""Perform K-means clustering with elbow method."""
print("\n=== K-Means Clustering ===")
# Elbow method
inertias = []
silhouettes = []
k_range = range(n_clusters_range[0], n_clusters_range[1] + 1)
for k in k_range:
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
labels = kmeans.fit_predict(self.scaled_data)
inertias.append(kmeans.inertia_)
if k > 1:
silhouettes.append(silhouette_score(self.scaled_data, labels))
else:
silhouettes.append(0)
# Plot elbow curve
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
ax1.plot(k_range, inertias, 'bo-')
ax1.set_xlabel('Number of Clusters (k)')
ax1.set_ylabel('Inertia')
ax1.set_title('Elbow Method for Optimal k')
ax1.grid(True, alpha=0.3)
ax2.plot(k_range, silhouettes, 'ro-')
ax2.set_xlabel('Number of Clusters (k)')
ax2.set_ylabel('Silhouette Score')
ax2.set_title('Silhouette Score by k')
ax2.grid(True, alpha=0.3)
self.save_plot('kmeans_elbow.png')
# Use optimal k (highest silhouette)
optimal_k = k_range[np.argmax(silhouettes)]
print(f"Optimal k by silhouette score: {optimal_k}")
# Final clustering
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
self.df['kmeans_cluster'] = kmeans.fit_predict(self.scaled_data)
# Save results
results = self.df[['Descriptor', 'kmeans_cluster']].copy()
results['cluster'] = results['kmeans_cluster'] + 1 # 1-indexed for readability
self.save_results(results[['Descriptor', 'cluster']], 'kmeans_clusters.csv')
# Cluster statistics
print(f"\nCluster sizes:")
print(self.df['kmeans_cluster'].value_counts().sort_index())
return self.df['kmeans_cluster']
def hierarchical_clustering(self, n_clusters=5, method='ward'):
"""Perform hierarchical clustering with dendrogram."""
print("\n=== Hierarchical Clustering ===")
# Compute linkage
Z = linkage(self.scaled_data, method=method)
# Plot dendrogram
plt.figure(figsize=(16, 8))
dendrogram(Z, labels=self.df['Descriptor'].values, leaf_font_size=8)
plt.title(f'Hierarchical Clustering Dendrogram ({method} linkage)')
plt.xlabel('Protocol')
plt.ylabel('Distance')
plt.xticks(rotation=90)
self.save_plot('hierarchical_dendrogram.png')
# Cut tree to get clusters
self.df['hierarchical_cluster'] = fcluster(Z, n_clusters, criterion='maxclust')
# Save results
results = self.df[['Descriptor', 'hierarchical_cluster']].copy()
results.columns = ['Descriptor', 'cluster']
self.save_results(results, 'hierarchical_clusters.csv')
print(f"\nCluster sizes:")
print(self.df['hierarchical_cluster'].value_counts().sort_index())
return self.df['hierarchical_cluster']
def dbscan_clustering(self, eps=3.0, min_samples=3):
"""Perform DBSCAN clustering to identify outliers."""
print("\n=== DBSCAN Clustering ===")
dbscan = DBSCAN(eps=eps, min_samples=min_samples)
self.df['dbscan_cluster'] = dbscan.fit_predict(self.scaled_data)
n_clusters = len(set(self.df['dbscan_cluster'])) - (1 if -1 in self.df['dbscan_cluster'] else 0)
n_outliers = (self.df['dbscan_cluster'] == -1).sum()
print(f"Found {n_clusters} clusters and {n_outliers} outliers")
# Save results
results = self.df[['Descriptor', 'dbscan_cluster']].copy()
results.columns = ['Descriptor', 'cluster']
self.save_results(results, 'dbscan_clusters.csv')
if n_outliers > 0:
outliers = self.df[self.df['dbscan_cluster'] == -1][['Descriptor']]
self.save_results(outliers, 'dbscan_outliers.csv')
print("\nOutlier protocols:")
for protocol in outliers['Descriptor']:
print(f" - {protocol}")
return self.df['dbscan_cluster']
# ========== DIMENSIONALITY REDUCTION ==========
def pca_analysis(self, n_components=None):
"""Perform PCA and visualize results."""
print("\n=== Principal Component Analysis ===")
# Fit PCA
if n_components is None:
pca = PCA()
else:
pca = PCA(n_components=n_components)
pca_coords = pca.fit_transform(self.scaled_data)
# Explained variance
explained_var = pca.explained_variance_ratio_
cumsum_var = np.cumsum(explained_var)
print(f"First 5 PCs explain {cumsum_var[4]*100:.1f}% of variance")
# Plot explained variance
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
n_show = min(15, len(explained_var))
ax1.bar(range(1, n_show + 1), explained_var[:n_show])
ax1.set_xlabel('Principal Component')
ax1.set_ylabel('Explained Variance Ratio')
ax1.set_title('Variance Explained by Each PC')
ax1.grid(True, alpha=0.3, axis='y')
ax2.plot(range(1, n_show + 1), cumsum_var[:n_show], 'o-')
ax2.axhline(y=0.8, color='r', linestyle='--', alpha=0.5, label='80% threshold')
ax2.set_xlabel('Number of Components')
ax2.set_ylabel('Cumulative Explained Variance')
ax2.set_title('Cumulative Variance Explained')
ax2.legend()
ax2.grid(True, alpha=0.3)
self.save_plot('pca_variance.png')
# 2D visualization
plt.figure(figsize=(12, 10))
plt.scatter(pca_coords[:, 0], pca_coords[:, 1], alpha=0.6, s=50)
# Annotate points
for i, protocol in enumerate(self.df['Descriptor']):
if i % 3 == 0: # Label every 3rd point to avoid clutter
plt.annotate(protocol, (pca_coords[i, 0], pca_coords[i, 1]),
fontsize=6, alpha=0.7)
plt.xlabel(f'PC1 ({explained_var[0]*100:.1f}% variance)')
plt.ylabel(f'PC2 ({explained_var[1]*100:.1f}% variance)')
plt.title('Protocols in PCA Space (First 2 Components)')
plt.grid(True, alpha=0.3)
self.save_plot('pca_2d.png')
# Save PCA coordinates
pca_df = pd.DataFrame(pca_coords[:, :5],
columns=[f'PC{i+1}' for i in range(min(5, pca_coords.shape[1]))])
pca_df.insert(0, 'Descriptor', self.df['Descriptor'])
self.save_results(pca_df, 'pca_coordinates.csv')
# Component loadings
loadings = pd.DataFrame(
pca.components_[:5, :].T,
columns=[f'PC{i+1}' for i in range(min(5, pca.components_.shape[0]))],
index=self.dimension_cols
)
self.save_results(loadings, 'pca_loadings.csv')
# Plot loadings heatmap
plt.figure(figsize=(10, 12))
sns.heatmap(loadings, cmap='RdBu_r', center=0, cbar_kws={'label': 'Loading'})
plt.title('PCA Component Loadings')
plt.tight_layout()
self.save_plot('pca_loadings_heatmap.png')
return pca_coords, pca
def tsne_analysis(self, perplexity=30, n_components=2):
"""Perform t-SNE analysis."""
print("\n=== t-SNE Analysis ===")
tsne = TSNE(n_components=n_components, perplexity=perplexity, random_state=42, max_iter=1000)
tsne_coords = tsne.fit_transform(self.scaled_data)
# Plot
plt.figure(figsize=(12, 10))
plt.scatter(tsne_coords[:, 0], tsne_coords[:, 1], alpha=0.6, s=50)
# Annotate some points
for i, protocol in enumerate(self.df['Descriptor']):
if i % 4 == 0: # Label every 4th point
plt.annotate(protocol, (tsne_coords[i, 0], tsne_coords[i, 1]),
fontsize=6, alpha=0.7)
plt.xlabel('t-SNE Dimension 1')
plt.ylabel('t-SNE Dimension 2')
plt.title(f't-SNE Projection (perplexity={perplexity})')
plt.grid(True, alpha=0.3)
self.save_plot('tsne_2d.png')
# Save coordinates
tsne_df = pd.DataFrame(tsne_coords, columns=['TSNE1', 'TSNE2'])
tsne_df.insert(0, 'Descriptor', self.df['Descriptor'])
self.save_results(tsne_df, 'tsne_coordinates.csv')
return tsne_coords
def umap_analysis(self, n_neighbors=15, min_dist=0.1, n_components=2):
"""Perform UMAP analysis if available."""
if not UMAP_AVAILABLE:
print("\n=== UMAP Analysis ===")
print("UMAP not available. Install with: pip install umap-learn")
return None
print("\n=== UMAP Analysis ===")
reducer = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist,
n_components=n_components, random_state=42)
umap_coords = reducer.fit_transform(self.scaled_data)
# Plot
plt.figure(figsize=(12, 10))
plt.scatter(umap_coords[:, 0], umap_coords[:, 1], alpha=0.6, s=50)
# Annotate some points
for i, protocol in enumerate(self.df['Descriptor']):
if i % 4 == 0:
plt.annotate(protocol, (umap_coords[i, 0], umap_coords[i, 1]),
fontsize=6, alpha=0.7)
plt.xlabel('UMAP Dimension 1')
plt.ylabel('UMAP Dimension 2')
plt.title(f'UMAP Projection (n_neighbors={n_neighbors}, min_dist={min_dist})')
plt.grid(True, alpha=0.3)
self.save_plot('umap_2d.png')
# Save coordinates
umap_df = pd.DataFrame(umap_coords, columns=['UMAP1', 'UMAP2'])
umap_df.insert(0, 'Descriptor', self.df['Descriptor'])
self.save_results(umap_df, 'umap_coordinates.csv')
return umap_coords
def factor_analysis(self, n_factors=5):
"""Perform factor analysis."""
print("\n=== Factor Analysis ===")
fa = FactorAnalysis(n_components=n_factors, random_state=42)
fa_coords = fa.fit_transform(self.scaled_data)
# Factor loadings
loadings = pd.DataFrame(
fa.components_.T,
columns=[f'Factor{i+1}' for i in range(n_factors)],
index=self.dimension_cols
)
self.save_results(loadings, 'factor_loadings.csv')
# Plot loadings heatmap
plt.figure(figsize=(10, 12))
sns.heatmap(loadings, cmap='RdBu_r', center=0, cbar_kws={'label': 'Loading'})
plt.title('Factor Analysis Loadings')
plt.tight_layout()
self.save_plot('factor_loadings_heatmap.png')
# Save factor scores
fa_df = pd.DataFrame(fa_coords,
columns=[f'Factor{i+1}' for i in range(n_factors)])
fa_df.insert(0, 'Descriptor', self.df['Descriptor'])
self.save_results(fa_df, 'factor_scores.csv')
return fa_coords, fa
# ========== CORRELATION & STRUCTURE ==========
def correlation_analysis(self):
"""Compute and visualize correlation matrices."""
print("\n=== Correlation Analysis ===")
# Full correlation matrix
corr_matrix = self.df[self.dimension_cols].corr()
# Plot full correlation heatmap
plt.figure(figsize=(16, 14))
sns.heatmap(corr_matrix, cmap='RdBu_r', center=0,
square=True, linewidths=0.5, cbar_kws={'label': 'Correlation'})
plt.title('Correlation Matrix - All Dimensions')
plt.tight_layout()
self.save_plot('correlation_heatmap_full.png')
# Save correlation matrix
self.save_results(corr_matrix, 'correlation_matrix.csv')
# Find strongest correlations
corr_pairs = []
for i in range(len(corr_matrix.columns)):
for j in range(i+1, len(corr_matrix.columns)):
corr_pairs.append({
'Dimension1': corr_matrix.columns[i],
'Dimension2': corr_matrix.columns[j],
'Correlation': corr_matrix.iloc[i, j]
})
corr_df = pd.DataFrame(corr_pairs).sort_values('Correlation',
key=abs,
ascending=False)
self.save_results(corr_df.head(20), 'top_correlations.csv')
print("\nTop 5 positive correlations:")
for _, row in corr_df.head(5).iterrows():
print(f" {row['Dimension1']} <-> {row['Dimension2']}: {row['Correlation']:.3f}")
print("\nTop 5 negative correlations:")
for _, row in corr_df.tail(5).iterrows():
print(f" {row['Dimension1']} <-> {row['Dimension2']}: {row['Correlation']:.3f}")
# Within-category correlations
self._plot_category_correlation('Design', self.design_cols)
self._plot_category_correlation('Entanglement', self.entanglement_cols)
self._plot_category_correlation('Experience', self.experience_cols)
return corr_matrix
def _plot_category_correlation(self, category_name, columns):
"""Plot correlation heatmap for a specific category."""
corr = self.df[columns].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='RdBu_r', center=0,
square=True, linewidths=0.5, cbar_kws={'label': 'Correlation'})
plt.title(f'{category_name} Dimensions - Correlation Matrix')
plt.tight_layout()
self.save_plot(f'correlation_heatmap_{category_name.lower()}.png')
    def network_analysis(self, threshold=0.5):
        """Build a protocol-similarity graph and report centrality metrics.

        Nodes are protocols (by positional index into self.df); an edge joins
        two protocols whose normalized similarity exceeds *threshold*.
        Similarity is 1 - (euclidean distance / max distance), so it lies in
        [0, 1]. Saves network_metrics.csv and a spring-layout plot when the
        graph has at least one edge. Returns the networkx Graph.
        """
        print("\n=== Network Analysis ===")
        # Compute pairwise distances
        distances = pdist(self.scaled_data, metric='euclidean')
        dist_matrix = squareform(distances)
        # Convert to similarity (inverse of distance, normalized)
        # NOTE(review): if all points coincide, max_dist is 0 and this divides
        # by zero -- presumably impossible for real readings; confirm upstream.
        max_dist = dist_matrix.max()
        similarity_matrix = 1 - (dist_matrix / max_dist)
        # Create network
        G = nx.Graph()
        # Add nodes (node id = positional row index, matching self.df.iloc)
        for i, protocol in enumerate(self.df['Descriptor']):
            G.add_node(i, label=protocol)
        # Add edges above threshold (upper triangle only, so each pair once)
        edge_count = 0
        for i in range(len(similarity_matrix)):
            for j in range(i+1, len(similarity_matrix)):
                if similarity_matrix[i, j] > threshold:
                    G.add_edge(i, j, weight=similarity_matrix[i, j])
                    edge_count += 1
        print(f"Network with {G.number_of_nodes()} nodes and {edge_count} edges")
        # Calculate network metrics (only meaningful when edges exist)
        if G.number_of_edges() > 0:
            degree_centrality = nx.degree_centrality(G)
            betweenness = nx.betweenness_centrality(G)
            metrics_df = pd.DataFrame({
                'Descriptor': [self.df.iloc[i]['Descriptor'] for i in G.nodes()],
                'Degree_Centrality': [degree_centrality[i] for i in G.nodes()],
                'Betweenness_Centrality': [betweenness[i] for i in G.nodes()]
            }).sort_values('Degree_Centrality', ascending=False)
            self.save_results(metrics_df, 'network_metrics.csv')
            print("\nTop 5 most central protocols:")
            for _, row in metrics_df.head(5).iterrows():
                print(f" {row['Descriptor']}: {row['Degree_Centrality']:.3f}")
            # Plot network (fixed seed keeps the layout reproducible)
            plt.figure(figsize=(16, 16))
            pos = nx.spring_layout(G, k=0.5, iterations=50, seed=42)
            # Node sizes based on degree centrality
            node_sizes = [degree_centrality[i] * 3000 + 100 for i in G.nodes()]
            nx.draw_networkx_nodes(G, pos, node_size=node_sizes,
                                   node_color='lightblue', alpha=0.7)
            nx.draw_networkx_edges(G, pos, alpha=0.2)
            # Labels for high-centrality nodes only, to avoid clutter
            high_centrality = {i: self.df.iloc[i]['Descriptor']
                               for i in G.nodes() if degree_centrality[i] > 0.1}
            nx.draw_networkx_labels(G, pos, labels=high_centrality, font_size=8)
            plt.title(f'Protocol Similarity Network (threshold={threshold})')
            plt.axis('off')
            plt.tight_layout()
            self.save_plot('network_graph.png')
        else:
            print("No edges above threshold - try lowering the threshold")
        return G
# ========== CLASSIFICATION & PREDICTION ==========
def category_discriminant_analysis(self):
"""Analyze how well dimension categories discriminate protocols."""
print("\n=== Category Discriminant Analysis ===")
results = []
for category_name, columns in [('Design', self.design_cols),
('Entanglement', self.entanglement_cols),
('Experience', self.experience_cols)]:
# Use one category to predict clustering from another
X = self.df[columns].values
# Use kmeans clusters as target if available
if 'kmeans_cluster' in self.df.columns:
y = self.df['kmeans_cluster'].values
# LDA
try:
lda = LinearDiscriminantAnalysis()
lda.fit(X, y)
score = lda.score(X, y)
results.append({
'Category': category_name,
'Accuracy': score,
'N_Dimensions': len(columns)
})
print(f"{category_name} dimensions predict clusters with {score*100:.1f}% accuracy")
except:
print(f"Could not perform LDA for {category_name}")
if results:
results_df = pd.DataFrame(results)
self.save_results(results_df, 'category_discriminant_results.csv')
return results
def feature_importance_analysis(self):
"""Analyze which dimensions are most important for clustering."""
print("\n=== Feature Importance Analysis ===")
if 'kmeans_cluster' not in self.df.columns:
print("Run clustering first to enable feature importance analysis")
return None
# Random Forest classifier
X = self.df[self.dimension_cols].values
y = self.df['kmeans_cluster'].values
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)
# Feature importances
importances = pd.DataFrame({
'Dimension': self.dimension_cols,
'Importance': rf.feature_importances_
}).sort_values('Importance', ascending=False)
self.save_results(importances, 'feature_importances.csv')
# Plot top 20
plt.figure(figsize=(10, 12))
top_20 = importances.head(20)
plt.barh(range(len(top_20)), top_20['Importance'])
plt.yticks(range(len(top_20)), top_20['Dimension'])
plt.xlabel('Importance')
plt.title('Top 20 Most Important Dimensions for Clustering')
plt.gca().invert_yaxis()
plt.tight_layout()
self.save_plot('feature_importances.png')
print("\nTop 10 most important dimensions:")
for _, row in importances.head(10).iterrows():
print(f" {row['Dimension']}: {row['Importance']:.4f}")
return importances
def analyst_comparison(self):
"""Compare ratings across different analysts."""
print("\n=== Analyst Comparison ===")
if 'analyst' not in self.df.columns:
print("No analyst column found")
return None
analysts = self.df['analyst'].unique()
print(f"Found {len(analysts)} unique analysts")
# Mean ratings by analyst for each dimension
analyst_means = self.df.groupby('analyst')[self.dimension_cols].mean()
self.save_results(analyst_means, 'analyst_mean_ratings.csv')
# Plot comparison
fig, axes = plt.subplots(3, 1, figsize=(14, 12))
for idx, (category_name, columns) in enumerate([
('Design', self.design_cols),
('Entanglement', self.entanglement_cols),
('Experience', self.experience_cols)
]):
analyst_means[columns].T.plot(ax=axes[idx], marker='o')
axes[idx].set_title(f'{category_name} Dimensions - Mean Ratings by Analyst')
axes[idx].set_ylabel('Mean Rating')
axes[idx].legend(title='Analyst', bbox_to_anchor=(1.05, 1), loc='upper left')
axes[idx].grid(True, alpha=0.3)
axes[idx].set_xticklabels(axes[idx].get_xticklabels(), rotation=45, ha='right')
plt.tight_layout()
self.save_plot('analyst_comparison.png')
return analyst_means
# ========== SUMMARY REPORT ==========
def generate_summary_report(self):
"""Generate a text summary of all analyses."""
print("\n=== Generating Summary Report ===")
report_lines = []
report_lines.append("=" * 80)
report_lines.append("MULTIVARIATE ANALYSIS SUMMARY REPORT")
report_lines.append("Protocol Bicorder Dataset")
report_lines.append("=" * 80)
report_lines.append("")
report_lines.append(f"Dataset: {self.csv_path}")
report_lines.append(f"Number of protocols: {len(self.df)}")
report_lines.append(f"Number of dimensions: {len(self.dimension_cols)}")
report_lines.append(f" - Design: {len(self.design_cols)}")
report_lines.append(f" - Entanglement: {len(self.entanglement_cols)}")
report_lines.append(f" - Experience: {len(self.experience_cols)}")
report_lines.append("")
report_lines.append("-" * 80)
report_lines.append("ANALYSES PERFORMED")
report_lines.append("-" * 80)
# Check which analyses were run
analyses_run = []
if 'kmeans_cluster' in self.df.columns:
analyses_run.append("- K-Means Clustering")
report_lines.append(f"K-Means: {len(self.df['kmeans_cluster'].unique())} clusters identified")
if 'hierarchical_cluster' in self.df.columns:
analyses_run.append("- Hierarchical Clustering")
report_lines.append(f"Hierarchical: {len(self.df['hierarchical_cluster'].unique())} clusters")
if 'dbscan_cluster' in self.df.columns:
analyses_run.append("- DBSCAN Clustering")
n_outliers = (self.df['dbscan_cluster'] == -1).sum()
report_lines.append(f"DBSCAN: {n_outliers} outlier protocols identified")
report_lines.append("")
report_lines.append("Dimensionality Reduction:")
report_lines.append("- Principal Component Analysis (PCA)")
report_lines.append("- t-SNE Projection")
if UMAP_AVAILABLE:
report_lines.append("- UMAP Projection")
report_lines.append("- Factor Analysis")
report_lines.append("")
report_lines.append("Statistical Analyses:")
report_lines.append("- Correlation Analysis")
report_lines.append("- Network Analysis")
report_lines.append("- Feature Importance Analysis")
if 'analyst' in self.df.columns:
report_lines.append("- Analyst Comparison")
report_lines.append("")
report_lines.append("-" * 80)
report_lines.append("OUTPUT FILES")
report_lines.append("-" * 80)
report_lines.append(f"All results saved to: {self.output_dir}/")
report_lines.append(" - plots/ : All visualizations (PNG)")
report_lines.append(" - data/ : All numerical results (CSV)")
report_lines.append(" - reports/ : This summary report")
report_lines.append("")
report_lines.append("=" * 80)
report_lines.append("END OF REPORT")
report_lines.append("=" * 80)
report_text = "\n".join(report_lines)
# Save report
report_path = self.output_dir / 'reports' / 'analysis_summary.txt'
with open(report_path, 'w') as f:
f.write(report_text)
print(f" Saved: {report_path}")
print("\n" + report_text)
return report_text
def main():
    """CLI entry point: parse arguments and run the requested analyses."""
    parser = argparse.ArgumentParser(
        description='Multivariate analysis of Protocol Bicorder data',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
 python3 scripts/multivariate_analysis.py data/readings/synthetic_20251116/readings.csv
 python3 scripts/multivariate_analysis.py data/readings/synthetic_20251116/readings.csv --output data/readings/synthetic_20251116/analysis
 python3 scripts/multivariate_analysis.py data/readings/synthetic_20251116/readings.csv --analyses clustering pca
 """
    )
    parser.add_argument('csv_file', help='Diagnostic readings CSV (e.g. data/readings/synthetic_20251116/readings.csv)')
    parser.add_argument('--output', '-o', default=None,
                        help='Output directory (default: <dataset_dir>/analysis)')
    parser.add_argument('--min-coverage', type=float, default=0.0,
                        help='Drop dimension columns below this coverage fraction (0.0–1.0). '
                             'E.g. 0.8 keeps only columns ≥80%% complete. '
                             'Useful for sparse/shortform datasets (default: 0.0, keep all)')
    parser.add_argument('--analyses', nargs='+',
                        choices=['clustering', 'pca', 'tsne', 'umap', 'factor',
                                 'correlation', 'network', 'importance', 'analyst', 'all'],
                        default=['all'],
                        help='Which analyses to run (default: all)')
    args = parser.parse_args()
    # Bail out early on a missing input file.
    if not Path(args.csv_file).exists():
        print(f"Error: File not found: {args.csv_file}")
        sys.exit(1)
    # Default the output directory to a sibling of the dataset CSV.
    output_dir = args.output if args.output else str(Path(args.csv_file).parent / 'analysis')
    print("=" * 80)
    print("PROTOCOL BICORDER - MULTIVARIATE ANALYSIS")
    print("=" * 80)
    analyzer = ProtocolAnalyzer(args.csv_file, output_dir, min_coverage=args.min_coverage)
    selected = set(args.analyses)

    def wants(name):
        # An analysis runs when named explicitly or when 'all' was chosen.
        return 'all' in selected or name in selected

    try:
        # Clustering
        if wants('clustering'):
            analyzer.kmeans_clustering()
            analyzer.hierarchical_clustering()
            analyzer.dbscan_clustering()
        # Dimensionality reduction
        if wants('pca'):
            analyzer.pca_analysis()
        if wants('tsne'):
            analyzer.tsne_analysis()
        if wants('umap'):
            analyzer.umap_analysis()
        if wants('factor'):
            analyzer.factor_analysis()
        # Correlation and structure
        if wants('correlation'):
            analyzer.correlation_analysis()
        if wants('network'):
            analyzer.network_analysis(threshold=0.6)
        # Classification
        if wants('importance'):
            analyzer.category_discriminant_analysis()
            analyzer.feature_importance_analysis()
        if wants('analyst'):
            analyzer.analyst_comparison()
        # Generate summary
        analyzer.generate_summary_report()
        print("\n" + "=" * 80)
        print("ANALYSIS COMPLETE!")
        print("=" * 80)
        print(f"\nAll results saved to: {analyzer.output_dir}/")
    except Exception as e:
        print(f"\nError during analysis: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,206 @@
#!/usr/bin/env python3
"""
Comprehensive review of the analysis for errors and inconsistencies.
Usage:
python3 scripts/review_analysis.py data/readings/synthetic_20251116.csv
python3 scripts/review_analysis.py data/readings/manual_20260101.csv --results-dir analysis_results/manual_20260101
"""
import argparse
import pandas as pd
import numpy as np
from pathlib import Path
def main():
parser = argparse.ArgumentParser(
description='Check analysis results for errors and inconsistencies',
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Example usage:
python3 scripts/review_analysis.py data/readings/synthetic_20251116/readings.csv
python3 scripts/review_analysis.py data/readings/manual_20260101/readings.csv --analysis-dir data/readings/manual_20260101/analysis
"""
)
parser.add_argument('input_csv', help='Diagnostic readings CSV (e.g. data/readings/synthetic_20251116/readings.csv)')
parser.add_argument('--analysis-dir', default=None,
help='Analysis directory (default: <dataset_dir>/analysis)')
args = parser.parse_args()
dataset_dir = Path(args.input_csv).parent
results_dir = Path(args.analysis_dir) if args.analysis_dir else dataset_dir / 'analysis'
print("=" * 80)
print("ANALYSIS REVIEW - ERROR CHECKING")
print("=" * 80)
print(f"Dataset: {args.input_csv}")
print(f"Results: {results_dir}")
# Load data
df = pd.read_csv(args.input_csv)
clusters = pd.read_csv(results_dir / 'data' / 'kmeans_clusters.csv')
pca_coords = pd.read_csv(results_dir / 'data' / 'pca_coordinates.csv')
# Identify dimension columns
design_cols = [c for c in df.columns if c.startswith('Design_')]
entanglement_cols = [c for c in df.columns if c.startswith('Entanglement_')]
experience_cols = [c for c in df.columns if c.startswith('Experience_')]
dimension_cols = design_cols + entanglement_cols + experience_cols
errors_found = []
warnings_found = []
print("\n1. DATA COMPLETENESS CHECK")
print("-" * 80)
missing_count = df[dimension_cols].isna().sum().sum()
rows_with_missing = df[dimension_cols].isna().any(axis=1).sum()
print(f"✓ Total protocols in source data: {len(df)}")
print(f"✓ Protocols with complete data: {len(df) - rows_with_missing}")
print(f"✓ Protocols with missing values: {rows_with_missing}")
print(f"✓ Protocols in cluster analysis: {len(clusters)}")
if rows_with_missing > 0:
warnings_found.append(f"{rows_with_missing} protocols excluded due to missing values")
missing_protocols = df[df[dimension_cols].isna().any(axis=1)]['Descriptor'].tolist()
print(f"\n Excluded protocols: {', '.join(missing_protocols)}")
merged = df.merge(clusters, on='Descriptor', how='inner')
if len(merged) != len(clusters):
errors_found.append(f"Descriptor mismatch: {len(merged)} matched vs {len(clusters)} expected")
else:
print(f"✓ All cluster descriptors match source data")
print("\n2. DATA QUALITY CHECK")
print("-" * 80)
for col in dimension_cols:
values = df[col].dropna()
if values.min() < 1 or values.max() > 9:
errors_found.append(f"Column {col} has out-of-range values: [{values.min()}, {values.max()}]")
print(f"✓ All dimension values within expected range [1, 9]")
df_clean = df.dropna(subset=dimension_cols)
variances = df_clean[dimension_cols].var()
low_var_dims = variances[variances < 1.0]
if len(low_var_dims) > 0:
warnings_found.append(f"{len(low_var_dims)} dimensions have very low variance (< 1.0)")
print(f"\n Low variance dimensions:")
for dim, var in low_var_dims.items():
print(f" - {dim}: {var:.3f}")
else:
print(f"✓ All dimensions have reasonable variance")
print("\n3. CLUSTERING VALIDATION")
print("-" * 80)
cluster_sizes = clusters['cluster'].value_counts().sort_index()
print(f"✓ Cluster 1: {cluster_sizes[1]} protocols ({cluster_sizes[1]/len(clusters)*100:.1f}%)")
print(f"✓ Cluster 2: {cluster_sizes[2]} protocols ({cluster_sizes[2]/len(clusters)*100:.1f}%)")
imbalance_ratio = max(cluster_sizes) / min(cluster_sizes)
if imbalance_ratio > 2.0:
warnings_found.append(f"Cluster imbalance ratio is {imbalance_ratio:.2f} (ideally < 2.0)")
if len(pca_coords) != len(clusters):
errors_found.append(f"PCA coordinates count ({len(pca_coords)}) != cluster count ({len(clusters)})")
else:
print(f"✓ PCA coordinates match cluster count")
pca_loadings = pd.read_csv(results_dir / 'data' / 'pca_loadings.csv', index_col=0)
if pca_loadings.shape[0] != 23:
errors_found.append(f"PCA loadings have {pca_loadings.shape[0]} rows, expected 23")
else:
print(f"✓ PCA loadings have correct dimensions")
print("\n4. STATISTICAL VALIDITY")
print("-" * 80)
corr_matrix = pd.read_csv(results_dir / 'data' / 'correlation_matrix.csv', index_col=0)
np.fill_diagonal(corr_matrix.values, 0)
perfect_corrs = np.where(np.abs(corr_matrix.values) > 0.99)
if len(perfect_corrs[0]) > 0:
warnings_found.append(f"Found {len(perfect_corrs[0])} near-perfect correlations between dimensions")
else:
print(f"✓ No perfect correlations found (multicollinearity check)")
try:
if corr_matrix.shape[0] == corr_matrix.shape[1]:
if not np.allclose(corr_matrix.values, corr_matrix.values.T, equal_nan=True):
errors_found.append("Correlation matrix is not symmetric")
else:
print(f"✓ Correlation matrix is symmetric")
else:
errors_found.append(f"Correlation matrix is not square: {corr_matrix.shape}")
except Exception as e:
warnings_found.append(f"Could not verify correlation matrix symmetry: {e}")
print("\n5. AVERAGE VALUES CHECK")
print("-" * 80)
df_clean = df.dropna(subset=dimension_cols)
calculated_averages = df_clean[dimension_cols].mean(axis=1)
print(f"✓ Average values range: [{calculated_averages.min():.2f}, {calculated_averages.max():.2f}]")
print(f"✓ Mean of averages: {calculated_averages.mean():.2f}")
print(f"✓ Std of averages: {calculated_averages.std():.2f}")
from scipy import stats
bins = np.arange(int(calculated_averages.min()), int(calculated_averages.max()) + 1, 0.5)
observed_counts, _ = np.histogram(calculated_averages, bins=bins)
expected_count = len(calculated_averages) / len(bins[:-1])
chi2_stat = np.sum((observed_counts - expected_count)**2 / expected_count)
p_value = 1 - stats.chi2.cdf(chi2_stat, len(bins) - 2)
print(f"✓ Distribution uniformity test p-value: {p_value:.4f}")
if p_value < 0.05:
print(f" (Distribution is significantly non-uniform, as expected for real data)")
else:
warnings_found.append("Average values may be too uniformly distributed (p > 0.05)")
print("\n6. CLUSTER SEPARATION CHECK")
print("-" * 80)
merged = df_clean.merge(clusters, on='Descriptor')
cluster1_means = merged[merged['cluster'] == 1][dimension_cols].mean()
cluster2_means = merged[merged['cluster'] == 2][dimension_cols].mean()
differences = (cluster1_means - cluster2_means).abs()
significant_diffs = differences[differences > 0.5]
print(f"✓ Dimensions with meaningful difference (>0.5) between clusters: {len(significant_diffs)}/23")
if len(significant_diffs) < 5:
warnings_found.append(f"Only {len(significant_diffs)} dimensions show meaningful separation between clusters")
print(f"\n Top 5 differentiating dimensions:")
for dim in differences.nlargest(5).index:
print(f" - {dim}: {differences[dim]:.3f}")
print("\n" + "=" * 80)
print("SUMMARY")
print("=" * 80)
if len(errors_found) == 0:
print("✓ No critical errors found!")
else:
print(f"{len(errors_found)} CRITICAL ERROR(S) FOUND:")
for i, error in enumerate(errors_found, 1):
print(f" {i}. {error}")
if len(warnings_found) == 0:
print("✓ No warnings!")
else:
print(f"\n{len(warnings_found)} WARNING(S):")
for i, warning in enumerate(warnings_found, 1):
print(f" {i}. {warning}")
print("\n" + "=" * 80)
print("REVIEW COMPLETE")
print("=" * 80)
# Entry point: run the full review/validation when executed as a script.
if __name__ == '__main__':
    main()

107
analysis/scripts/sync_readings.sh Executable file
View File

@@ -0,0 +1,107 @@
#!/usr/bin/env bash
# Sync a readings dataset from a remote git repository, then regenerate CSV and analysis.
#
# Reads remote URL and subdirectory from a .sync_source file in the dataset directory.
#
# Usage:
#   scripts/sync_readings.sh data/readings/manual_20260320
#   scripts/sync_readings.sh data/readings/manual_20260320 --no-analysis
#   scripts/sync_readings.sh data/readings/manual_20260320 --min-coverage 0.8
#   scripts/sync_readings.sh data/readings/manual_20260320 --training data/readings/synthetic_20251116/readings.csv
#
# .sync_source format:
#   REMOTE_URL=https://git.example.org/user/repo
#   REMOTE_SUBDIR=readings
set -euo pipefail
DATASET_DIR="${1:?Usage: $0 <dataset_dir> [--no-analysis] [--min-coverage N] [--training CSV]}"
RUN_ANALYSIS=true
MIN_COVERAGE=0.8
TRAINING_CSV="data/readings/synthetic_20251116/readings.csv"
shift || true
while [[ $# -gt 0 ]]; do
    case "$1" in
        --no-analysis) RUN_ANALYSIS=false ;;
        # ${2:?...} gives a clear error instead of an unbound-variable abort
        # when the flag is passed without a value (set -u is active).
        --min-coverage) MIN_COVERAGE="${2:?--min-coverage requires a value}"; shift ;;
        --training) TRAINING_CSV="${2:?--training requires a value}"; shift ;;
        *) echo "Unknown option: $1" >&2; exit 1 ;;
    esac
    shift
done
SYNC_SOURCE="$DATASET_DIR/.sync_source"
if [[ ! -f "$SYNC_SOURCE" ]]; then
    echo "Error: $SYNC_SOURCE not found. Create it with REMOTE_URL and REMOTE_SUBDIR." >&2
    exit 1
fi
# Load config.
# The trailing `|| true` matters: under `set -e -o pipefail`, a missing key
# would make the grep pipeline (and thus the assignment) fail and abort the
# script before the explicit error message below could ever run.
REMOTE_URL=$(grep '^REMOTE_URL=' "$SYNC_SOURCE" | cut -d= -f2- || true)
REMOTE_SUBDIR=$(grep '^REMOTE_SUBDIR=' "$SYNC_SOURCE" | cut -d= -f2- || true)
if [[ -z "$REMOTE_URL" ]]; then
    echo "Error: REMOTE_URL not set in $SYNC_SOURCE" >&2
    exit 1
fi
REMOTE_SUBDIR="${REMOTE_SUBDIR:-readings}"
JSON_DIR="$DATASET_DIR/json"
echo "========================================"
echo "Syncing: $DATASET_DIR"
echo "From: $REMOTE_URL/$REMOTE_SUBDIR"
echo "========================================"
# Clone remote to temp dir and copy JSON files.
# Use a private name rather than TMPDIR: TMPDIR is a standard environment
# variable consulted by mktemp and other tools, and must not be clobbered
# for every subsequent command in this script.
TMP_CLONE=$(mktemp -d)
trap 'rm -rf "$TMP_CLONE"' EXIT
echo ""
echo "Fetching remote data..."
git clone --depth 1 --quiet "$REMOTE_URL" "$TMP_CLONE"
SRC="$TMP_CLONE/$REMOTE_SUBDIR"
if [[ ! -d "$SRC" ]]; then
    echo "Error: subdirectory '$REMOTE_SUBDIR' not found in remote repo." >&2
    exit 1
fi
# Count only top-level *.json files -- that is exactly what the cp glob
# below copies (the old recursive find could report a different number).
NEW=$(find "$SRC" -maxdepth 1 -name '*.json' | wc -l | tr -d ' ')
if [[ "$NEW" -eq 0 ]]; then
    echo "Error: no JSON files found in $SRC" >&2
    exit 1
fi
mkdir -p "$JSON_DIR"
cp "$SRC"/*.json "$JSON_DIR"/
echo "Copied $NEW JSON files → $JSON_DIR"
# Prefer the project virtualenv's interpreter when present.
PYTHON=python3
if [[ -f ".venv/bin/python3" ]]; then
    PYTHON=".venv/bin/python3"
fi
# Regenerate CSV
echo ""
echo "Regenerating readings.csv..."
"$PYTHON" scripts/json_to_csv.py "$JSON_DIR" -o "$DATASET_DIR/readings.csv"
if [[ "$RUN_ANALYSIS" == true ]]; then
    echo ""
    echo "Running multivariate analysis (--min-coverage $MIN_COVERAGE)..."
    "$PYTHON" scripts/multivariate_analysis.py \
        "$DATASET_DIR/readings.csv" \
        --min-coverage "$MIN_COVERAGE" \
        --analyses clustering pca correlation importance
    echo ""
    echo "Generating LDA visualization..."
    "$PYTHON" scripts/lda_visualization.py "$DATASET_DIR/readings.csv"
    echo ""
    echo "Classifying readings (training: $TRAINING_CSV)..."
    "$PYTHON" scripts/classify_readings.py \
        "$DATASET_DIR/readings.csv" \
        --training "$TRAINING_CSV"
fi
echo ""
echo "Done. Dataset: $DATASET_DIR"

View File

@@ -0,0 +1,167 @@
#!/usr/bin/env python3
"""
Create visualizations of k-means clusters overlaid on dimensionality reduction plots.
Usage:
python3 scripts/visualize_clusters.py data/readings/synthetic_20251116.csv
python3 scripts/visualize_clusters.py data/readings/manual_20260101.csv --results-dir analysis_results/manual_20260101
"""
import argparse
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
def _annotate_points(ax, data, xcol, ycol):
    """Label every 8th point with its Descriptor (sampled to avoid clutter).

    The i % 8 sampling keys off the merged frame's row index, matching the
    original per-cluster annotation behavior.
    """
    for i, row in data.iterrows():
        if i % 8 == 0:
            ax.annotate(row['Descriptor'],
                        (row[xcol], row[ycol]),
                        fontsize=7, alpha=0.7,
                        xytext=(5, 5), textcoords='offset points')


def _plot_clustered(data, xcol, ycol, xlabel, ylabel, title, out_path,
                    colors, cluster_names):
    """Scatter a 2-D embedding colored by k-means cluster and save to out_path.

    Args:
        data: DataFrame with xcol, ycol, 'cluster', and 'Descriptor' columns.
        xcol, ycol: coordinate column names (e.g. 'PC1'/'PC2').
        xlabel, ylabel, title: axis/figure text.
        out_path: Path for the saved PNG.
        colors: {cluster_id: hex color}.
        cluster_names: {cluster_id: legend label}.
    """
    fig, ax = plt.subplots(figsize=(14, 12))
    for cluster_id in [1, 2]:
        cluster_data = data[data['cluster'] == cluster_id]
        ax.scatter(cluster_data[xcol], cluster_data[ycol],
                   c=colors[cluster_id], label=cluster_names[cluster_id],
                   alpha=0.6, s=60, edgecolors='white', linewidth=0.5)
        _annotate_points(ax, cluster_data, xcol, ycol)
    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.legend(loc='best', fontsize=10, framealpha=0.9)
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(out_path, dpi=300, bbox_inches='tight')
    print(f"  Saved: {out_path}")
    plt.close()


def main():
    """Load cluster assignments + embedding coordinates and write clustered plots.

    Reads kmeans_clusters.csv plus pca/tsne (and optionally umap) coordinate
    CSVs from <analysis-dir>/data, and writes *_2d_clustered.png files to
    <analysis-dir>/plots.
    """
    parser = argparse.ArgumentParser(
        description='Visualize k-means clusters in PCA/t-SNE/UMAP space',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
  python3 scripts/visualize_clusters.py data/readings/synthetic_20251116/readings.csv
  python3 scripts/visualize_clusters.py data/readings/manual_20260101/readings.csv --analysis-dir data/readings/manual_20260101/analysis
        """
    )
    parser.add_argument('input_csv', help='Diagnostic readings CSV (e.g. data/readings/synthetic_20251116/readings.csv)')
    parser.add_argument('--analysis-dir', default=None,
                        help='Analysis directory (default: <dataset_dir>/analysis)')
    args = parser.parse_args()
    dataset_dir = Path(args.input_csv).parent
    results_dir = Path(args.analysis_dir) if args.analysis_dir else dataset_dir / 'analysis'
    plots_dir = results_dir / 'plots'
    data_dir = results_dir / 'data'
    # Ensure the output directory exists; on a fresh analysis dir savefig
    # would otherwise fail with FileNotFoundError.
    plots_dir.mkdir(parents=True, exist_ok=True)

    # Load cluster assignments (the 'cluster' column is already 1-indexed).
    clusters = pd.read_csv(data_dir / 'kmeans_clusters.csv')

    # Load dimensionality reduction coordinates
    pca_coords = pd.read_csv(data_dir / 'pca_coordinates.csv')
    tsne_coords = pd.read_csv(data_dir / 'tsne_coordinates.csv')

    # Merge cluster assignments with coordinates
    pca_data = pca_coords.merge(clusters, on='Descriptor')
    tsne_data = tsne_coords.merge(clusters, on='Descriptor')

    # Set up color scheme
    colors = {1: '#2E86AB', 2: '#A23B72'}  # Blue for cluster 1, Purple for cluster 2
    cluster_names = {1: 'Cluster 1: Relational/Cultural', 2: 'Cluster 2: Institutional/Bureaucratic'}

    print("Creating PCA plot with cluster colors...")
    # NOTE(review): the variance percentages below are hard-coded from one
    # specific dataset's PCA run -- TODO read explained variance from the
    # analysis output so labels are correct for other datasets.
    _plot_clustered(pca_data, 'PC1', 'PC2',
                    'PC1 (22.5% variance)', 'PC2 (22.7% variance)',
                    'K-Means Clusters in PCA Space\nTwo Distinct Protocol Families',
                    plots_dir / 'pca_2d_clustered.png', colors, cluster_names)

    print("Creating t-SNE plot with cluster colors...")
    _plot_clustered(tsne_data, 'TSNE1', 'TSNE2',
                    't-SNE Dimension 1', 't-SNE Dimension 2',
                    'K-Means Clusters in t-SNE Space\nTwo Distinct Protocol Families',
                    plots_dir / 'tsne_2d_clustered.png', colors, cluster_names)

    # UMAP coordinates are optional (only present if umap-learn was installed
    # when the analysis ran).
    umap_path = data_dir / 'umap_coordinates.csv'
    if umap_path.exists():
        print("Creating UMAP plot with cluster colors...")
        umap_coords = pd.read_csv(umap_path)
        umap_data = umap_coords.merge(clusters, on='Descriptor')
        _plot_clustered(umap_data, 'UMAP1', 'UMAP2',
                        'UMAP Dimension 1', 'UMAP Dimension 2',
                        'K-Means Clusters in UMAP Space\nTwo Distinct Protocol Families',
                        plots_dir / 'umap_2d_clustered.png', colors, cluster_names)

    # ========== Summary Statistics ==========
    print("\n=== Cluster Summary ===")
    print(f"Total protocols: {len(clusters)}")
    print(f"\nCluster 1 (Relational/Cultural): {len(clusters[clusters['cluster'] == 1])} protocols")
    print(f"Cluster 2 (Institutional/Bureaucratic): {len(clusters[clusters['cluster'] == 2])} protocols")
    print("\nSample protocols from each cluster:")
    print("\nCluster 1 (Relational/Cultural):")
    for protocol in clusters[clusters['cluster'] == 1]['Descriptor'].head(10):
        print(f"  - {protocol}")
    print("\nCluster 2 (Institutional/Bureaucratic):")
    for protocol in clusters[clusters['cluster'] == 2]['Descriptor'].head(10):
        print(f"  - {protocol}")
    print("\n=== Visualization Complete! ===")
# Entry point: generate clustered visualizations when executed as a script.
if __name__ == '__main__':
    main()