Reorganize directory, add manual dataset and sync tooling
- Move all scripts to scripts/, web assets to web/, analysis results into self-contained data/readings/<type>_<YYYYMMDD>/ directories - Add data/readings/manual_20260320/ with 32 JSON readings from git.medlab.host/ntnsndr/protocol-bicorder-data - Add scripts/json_to_csv.py to convert bicorder JSON files to CSV - Add scripts/sync_readings.sh for one-command sync + re-analysis of any dataset backed by a .sync_source config file - Add scripts/classify_readings.py to apply the LDA classifier to all readings and save per-reading cluster assignments - Add --min-coverage flag to multivariate_analysis.py for sparse/shortform datasets; also applies in lda_visualization.py - Fix lda_visualization.py NaN handling and 0-d array annotation bug - Update README.md and WORKFLOW.md to document datasets, sync workflow, shortform handling, and new scripts Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
BIN
analysis/scripts/__pycache__/bicorder_classifier.cpython-314.pyc
Normal file
BIN
analysis/scripts/__pycache__/bicorder_classifier.cpython-314.pyc
Normal file
Binary file not shown.
155
analysis/scripts/bicorder_analyze.py
Normal file
155
analysis/scripts/bicorder_analyze.py
Normal file
@@ -0,0 +1,155 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Protocol Bicorder Analysis Script
|
||||
|
||||
Processes a two-column CSV file (protocol descriptor and description) and adds
|
||||
columns for each diagnostic gradient from bicorder.json. Values to be filled
|
||||
by LLM commands.
|
||||
"""
|
||||
|
||||
import csv
|
||||
import json
|
||||
import sys
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def load_bicorder_config(bicorder_path):
    """Read the bicorder.json configuration file and return it as a dict."""
    with open(bicorder_path, 'r') as config_file:
        return json.load(config_file)
|
||||
|
||||
|
||||
def extract_gradients(bicorder_data):
    """Flatten every diagnostic set into a flat list of gradient records.

    Each record carries a unique 'column_name' of the form
    "<set>_<left>_vs_<right>" plus the terms and their descriptions.
    """
    gradients = []
    for dset in bicorder_data['diagnostic']:
        set_name = dset['set_name']
        gradients.extend(
            {
                'column_name': f"{set_name}_{g['term_left']}_vs_{g['term_right']}",
                'set_name': set_name,
                'term_left': g['term_left'],
                'term_left_description': g['term_left_description'],
                'term_right': g['term_right'],
                'term_right_description': g['term_right_description'],
            }
            for g in dset['gradients']
        )
    return gradients
|
||||
|
||||
|
||||
def process_csv(input_csv, output_csv, bicorder_path, analyst=None, standpoint=None):
    """
    Process the input CSV and add gradient columns.

    Copies every row of the input CSV and appends one empty column per
    diagnostic gradient (to be filled by LLM commands later), plus optional
    analyst/standpoint metadata columns.

    Args:
        input_csv: Path to input CSV file
        output_csv: Path to output CSV file
        bicorder_path: Path to bicorder.json file
        analyst: Optional analyst name
        standpoint: Optional standpoint description
    """
    # Load bicorder configuration
    bicorder_data = load_bicorder_config(bicorder_path)
    gradients = extract_gradients(bicorder_data)

    with open(input_csv, 'r', encoding='utf-8') as infile, \
            open(output_csv, 'w', newline='', encoding='utf-8') as outfile:

        reader = csv.DictReader(infile)

        # Get original fieldnames from input CSV, filter out None/empty
        # (DictReader yields a None key for rows longer than the header)
        original_fields = [f for f in reader.fieldnames if f and f.strip()]

        # Add gradient columns and metadata columns
        gradient_columns = [g['column_name'] for g in gradients]
        output_fields = list(original_fields) + gradient_columns

        # Add metadata columns if provided
        if analyst is not None:
            output_fields.append('analyst')
        if standpoint is not None:
            output_fields.append('standpoint')

        writer = csv.DictWriter(outfile, fieldnames=output_fields)
        writer.writeheader()

        # Process each protocol row
        row_count = 0
        for protocol_row in reader:
            # Start with original row data, filter out None keys
            output_row = {k: v for k, v in protocol_row.items() if k and k.strip()}

            # Initialize all gradient columns as empty (to be filled by LLM)
            for gradient in gradients:
                output_row[gradient['column_name']] = ''

            # Add metadata if provided
            if analyst is not None:
                output_row['analyst'] = analyst
            if standpoint is not None:
                output_row['standpoint'] = standpoint

            writer.writerow(output_row)
            row_count += 1

            descriptor = protocol_row.get('Descriptor', '').strip()
            print(f"Processed protocol {row_count}: {descriptor}")

    print(f"\nOutput written to: {output_csv}")
    print(f"Total protocols: {row_count}")
    print(f"Gradient columns added: {len(gradients)}")
    print(f"\nGradient columns:")
    for i, gradient in enumerate(gradients, 1):
        print(f"  {i}. {gradient['column_name']}")
|
||||
|
||||
|
||||
def main():
    """Parse CLI arguments, validate inputs, and run the CSV preparation step."""
    parser = argparse.ArgumentParser(
        description='Process protocol CSV and add bicorder diagnostic columns',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
  python3 bicorder_analyze.py protocols_edited.csv -o output.csv
  python3 bicorder_analyze.py protocols_raw.csv -o output.csv -a "Jane Doe" -s "Researcher perspective"

The script will preserve all original columns and add one column per diagnostic gradient.
Each gradient column will be empty, ready to be filled by LLM commands.
"""
    )

    parser.add_argument('input_csv', help='Input CSV file with protocol data')
    parser.add_argument('-o', '--output', required=True, help='Output CSV file')
    parser.add_argument('-b', '--bicorder',
                        default='../bicorder.json',
                        help='Path to bicorder.json (default: ../bicorder.json)')
    parser.add_argument('-a', '--analyst', help='Analyst name (adds analyst column)')
    parser.add_argument('-s', '--standpoint', help='Analyst standpoint (adds standpoint column)')

    args = parser.parse_args()

    # Validate input file exists
    if not Path(args.input_csv).exists():
        print(f"Error: Input file '{args.input_csv}' not found", file=sys.stderr)
        sys.exit(1)

    # Validate bicorder.json exists
    if not Path(args.bicorder).exists():
        print(f"Error: Bicorder config '{args.bicorder}' not found", file=sys.stderr)
        sys.exit(1)

    # Process the CSV
    process_csv(
        args.input_csv,
        args.output,
        args.bicorder,
        args.analyst,
        args.standpoint
    )


if __name__ == '__main__':
    main()
|
||||
175
analysis/scripts/bicorder_batch.py
Normal file
175
analysis/scripts/bicorder_batch.py
Normal file
@@ -0,0 +1,175 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Batch process all protocols in a CSV using the Bicorder framework.
|
||||
|
||||
This script orchestrates the entire analysis workflow:
|
||||
1. Creates output CSV with gradient columns
|
||||
2. For each protocol row:
|
||||
- Queries all 23 gradients (each in a new chat)
|
||||
- Updates CSV with results
|
||||
"""
|
||||
|
||||
import csv
|
||||
import json
|
||||
import sys
|
||||
import argparse
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def count_csv_rows(csv_path):
    """Return the number of data rows (header excluded) in a CSV file."""
    with open(csv_path, 'r', encoding='utf-8') as handle:
        return sum(1 for _ in csv.DictReader(handle))
|
||||
|
||||
|
||||
def run_bicorder_analyze(input_csv, output_csv, bicorder_path, analyst=None, standpoint=None):
    """Run bicorder_analyze.py to create output CSV.

    Invokes the sibling script as a subprocess and returns True on success,
    False if the subprocess exited non-zero (stderr is echoed in that case).
    """
    # Resolve the helper script relative to this file so the batch runner
    # works regardless of the caller's current working directory.
    cmd = ['python3', str(Path(__file__).parent / 'bicorder_analyze.py'), input_csv, '-o', output_csv, '-b', bicorder_path]

    if analyst:
        cmd.extend(['-a', analyst])
    if standpoint:
        cmd.extend(['-s', standpoint])

    print(f"Creating analysis CSV: {output_csv}")
    result = subprocess.run(cmd, capture_output=True, text=True)

    if result.returncode != 0:
        print(f"Error creating CSV: {result.stderr}", file=sys.stderr)
        return False

    print(result.stdout)
    return True
|
||||
|
||||
|
||||
def query_gradients(output_csv, row_num, bicorder_path, model=None):
    """Query all gradients for a protocol row.

    Delegates to bicorder_query.py in a subprocess. Returns True on
    success, False if the subprocess exited non-zero.
    """
    cmd = ['python3', str(Path(__file__).parent / 'bicorder_query.py'), output_csv, str(row_num),
           '-b', bicorder_path]

    if model:
        cmd.extend(['-m', model])

    print(f"Starting gradient queries...")

    # Don't capture output - let it print in real-time for progress visibility
    result = subprocess.run(cmd)

    if result.returncode != 0:
        print(f"Error querying gradients", file=sys.stderr)
        return False

    return True
|
||||
|
||||
|
||||
def process_protocol_row(input_csv, output_csv, row_num, total_rows, bicorder_path, model=None):
    """Process a single protocol row through the complete workflow.

    Prints a progress banner, then queries every gradient for the row
    (each gradient in a fresh chat). Returns True on success.
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Row {row_num}/{total_rows}")
    print(f"{banner}")

    # Query all gradients (each gradient gets a new chat)
    queried_ok = query_gradients(output_csv, row_num, bicorder_path, model)
    if not queried_ok:
        print(f"[FAILED] Could not query gradients")
        return False

    print(f"✓ Row {row_num} complete")
    return True
|
||||
|
||||
|
||||
def main():
    """Parse CLI arguments and drive the full batch-analysis workflow."""
    parser = argparse.ArgumentParser(
        description='Batch process protocols through Bicorder analysis (each gradient uses a new chat)',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
  # Process all protocols
  python3 bicorder_batch.py data/readings/synthetic_20251116/protocols_edited.csv -o data/readings/synthetic_20251116/readings.csv

  # Process specific rows
  python3 bicorder_batch.py data/readings/synthetic_20251116/protocols_edited.csv -o data/readings/synthetic_20251116/readings.csv --start 1 --end 5

  # With specific model
  python3 bicorder_batch.py data/readings/synthetic_20251116/protocols_edited.csv -o data/readings/synthetic_20251116/readings.csv -m mistral

  # With metadata
  python3 bicorder_batch.py data/readings/synthetic_20251116/protocols_edited.csv -o data/readings/synthetic_20251116/readings.csv -a "Your Name" -s "Your standpoint"
"""
    )

    parser.add_argument('input_csv', help='Input CSV file with protocol data')
    parser.add_argument('-o', '--output', required=True, help='Output CSV file')
    parser.add_argument('-b', '--bicorder',
                        default='../bicorder.json',
                        help='Path to bicorder.json (default: ../bicorder.json)')
    parser.add_argument('-m', '--model', help='LLM model to use')
    parser.add_argument('-a', '--analyst', help='Analyst name')
    parser.add_argument('-s', '--standpoint', help='Analyst standpoint')
    parser.add_argument('--start', type=int, default=1,
                        help='Start row number (1-indexed, default: 1)')
    parser.add_argument('--end', type=int,
                        help='End row number (1-indexed, default: all rows)')
    parser.add_argument('--resume', action='store_true',
                        help='Resume from existing output CSV (skip rows with values)')

    args = parser.parse_args()

    # Validate input file exists
    if not Path(args.input_csv).exists():
        print(f"Error: Input file '{args.input_csv}' not found", file=sys.stderr)
        sys.exit(1)

    # Validate bicorder.json exists
    if not Path(args.bicorder).exists():
        print(f"Error: Bicorder config '{args.bicorder}' not found", file=sys.stderr)
        sys.exit(1)

    # Count rows in input CSV
    total_rows = count_csv_rows(args.input_csv)
    end_row = args.end if args.end else total_rows

    if args.start > total_rows or end_row > total_rows:
        print(f"Error: Row range exceeds CSV size ({total_rows} rows)", file=sys.stderr)
        sys.exit(1)

    print(f"Bicorder Batch Analysis")
    print(f"Input: {args.input_csv} ({total_rows} protocols)")
    print(f"Output: {args.output}")
    print(f"Processing rows: {args.start} to {end_row}")
    if args.model:
        print(f"Model: {args.model}")
    print()

    # Step 1: Create output CSV (unless resuming)
    if not args.resume or not Path(args.output).exists():
        if not run_bicorder_analyze(args.input_csv, args.output, args.bicorder,
                                    args.analyst, args.standpoint):
            sys.exit(1)
    else:
        print(f"Resuming from existing CSV: {args.output}")

    # Step 2: Process each protocol row; failures are reported but do not
    # abort the batch.
    success_count = 0
    fail_count = 0

    for row_num in range(args.start, end_row + 1):
        if process_protocol_row(args.input_csv, args.output, row_num, end_row,
                                args.bicorder, args.model):
            success_count += 1
        else:
            fail_count += 1
            print(f"[WARNING] Row {row_num} failed, continuing...")

    # Summary
    print(f"\n{'='*60}")
    print(f"BATCH COMPLETE")
    print(f"{'='*60}")
    print(f"Successful: {success_count}")
    print(f"Failed: {fail_count}")
    print(f"Output: {args.output}")


if __name__ == '__main__':
    main()
|
||||
364
analysis/scripts/bicorder_classifier.py
Normal file
364
analysis/scripts/bicorder_classifier.py
Normal file
@@ -0,0 +1,364 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Bicorder Cluster Classifier
|
||||
|
||||
Provides real-time protocol classification and smart form recommendation
|
||||
based on the two-cluster analysis.
|
||||
|
||||
Usage:
|
||||
from bicorder_classifier import BicorderClassifier
|
||||
|
||||
classifier = BicorderClassifier()
|
||||
|
||||
# As user fills in dimensions
|
||||
ratings = {
|
||||
'Design_explicit_vs_implicit': 7,
|
||||
'Design_elite_vs_vernacular': 2,
|
||||
# ... etc
|
||||
}
|
||||
|
||||
result = classifier.predict(ratings)
|
||||
print(f"Cluster: {result['cluster']}")
|
||||
print(f"Confidence: {result['confidence']:.1%}")
|
||||
print(f"Recommend form: {result['recommended_form']}")
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
# Path to bicorder.json (relative to this script: scripts/ -> analysis/ -> repo root)
_BICORDER_JSON = Path(__file__).parent.parent.parent / 'bicorder.json'

# Historical column renames: maps old CSV column names → current bicorder.json names.
# Add an entry here whenever gradient terms are renamed in bicorder.json, so that
# older readings CSVs keep matching the dimensions derived from the current config.
_COLUMN_RENAMES = {
    'Design_elite_vs_vernacular': 'Design_institutional_vs_vernacular',
    'Entanglement_exclusive_vs_non-exclusive': 'Entanglement_monopolistic_vs_pluralistic',
    'Experience_sufficient_vs_insufficient': 'Experience_sufficient_vs_limited',
    'Experience_Kafka_vs_Whitehead': 'Experience_restraining_vs_liberating',
}
|
||||
|
||||
|
||||
def _load_bicorder_dimensions(bicorder_path=_BICORDER_JSON):
    """Read DIMENSIONS and KEY_DIMENSIONS from bicorder.json.

    Returns a pair (all_dimension_names, shortform_dimension_names); the
    shortform-flagged gradients double as the "key" dimensions.
    """
    with open(bicorder_path) as fh:
        config = json.load(fh)
    dimensions = []
    key_dimensions = []
    for category in config['diagnostic']:
        prefix = category['set_name']
        for gradient in category['gradients']:
            name = f"{prefix}_{gradient['term_left']}_vs_{gradient['term_right']}"
            dimensions.append(name)
            if gradient.get('shortform', False):
                key_dimensions.append(name)
    return dimensions, key_dimensions
|
||||
|
||||
|
||||
class BicorderClassifier:
    """
    Classifies protocols into one of two families and recommends form type.

    At construction the classifier re-fits its model from the readings CSV
    plus the offline k-means cluster assignments: a StandardScaler followed
    by a 1-component LinearDiscriminantAnalysis over all gradient dimensions.
    """

    # Cluster names (keys match the 'cluster' values in kmeans_clusters.csv)
    CLUSTER_NAMES = {
        1: "Relational/Cultural",
        2: "Institutional/Bureaucratic"
    }

    def __init__(self, diagnostic_csv='data/readings/synthetic_20251116/readings.csv',
                 model_path=None):
        """Initialize classifier with pre-computed model data.

        Args:
            diagnostic_csv: Readings CSV used as training data.
            model_path: Directory containing kmeans_clusters.csv; defaults
                to <diagnostic_csv's directory>/analysis/data.
        """
        if model_path is None:
            model_path = str(Path(diagnostic_csv).parent / 'analysis' / 'data')
        self._diagnostic_csv = diagnostic_csv
        self.model_path = Path(model_path)
        self.scaler = StandardScaler()
        self.lda = None
        self.cluster_centroids = None

        # Derive dimension lists from bicorder.json
        self.DIMENSIONS, self.KEY_DIMENSIONS = _load_bicorder_dimensions()

        # Load training data to fit scaler and LDA
        self._load_model()

    def _load_model(self):
        """Load and fit the classification model from analysis results."""
        # Load the original data and cluster assignments
        df = pd.read_csv(self._diagnostic_csv)
        clusters = pd.read_csv(self.model_path / 'kmeans_clusters.csv')

        # Rename old column names to match current bicorder.json
        df = df.rename(columns=_COLUMN_RENAMES)

        # Remove duplicates (first reading per Descriptor wins)
        df = df.drop_duplicates(subset='Descriptor', keep='first')

        # Merge and clean: keep only rows with a cluster label and no
        # missing gradient values.
        merged = df.merge(clusters, on='Descriptor')
        merged_clean = merged.dropna(subset=self.DIMENSIONS)

        # Prepare training data
        X = merged_clean[self.DIMENSIONS].values
        y = merged_clean['cluster'].values

        # Fit scaler
        self.scaler.fit(X)
        X_scaled = self.scaler.transform(X)

        # Fit LDA (single discriminant axis for the two clusters)
        self.lda = LinearDiscriminantAnalysis(n_components=1)
        self.lda.fit(X_scaled, y)

        # Calculate cluster centroids in scaled space
        self.cluster_centroids = {}
        for cluster_id in [1, 2]:
            cluster_data = X_scaled[y == cluster_id]
            self.cluster_centroids[cluster_id] = cluster_data.mean(axis=0)

    def predict(self, ratings, return_details=True):
        """
        Predict cluster for given ratings.

        Args:
            ratings: Dict mapping dimension names to values (1-9)
                Can be partial - missing dimensions are filled with median.
                NOTE(review): keys not present in self.DIMENSIONS are
                silently ignored — verify callers use current names.
            return_details: If True, returns detailed information

        Returns:
            Dict with:
            - cluster: Predicted cluster number (1 or 2)
            - cluster_name: Human-readable cluster name
            - confidence: Confidence score (0-1)
            - completeness: Fraction of dimensions provided (0-1)
            - recommended_form: 'short' or 'long'
            - distance_to_boundary: How far from cluster boundary
            - lda_score: Score on the discriminant axis
        """
        # Convert ratings to full vector
        X = np.full(len(self.DIMENSIONS), np.nan)
        provided_count = 0

        for i, dim in enumerate(self.DIMENSIONS):
            if dim in ratings:
                X[i] = ratings[dim]
                provided_count += 1

        completeness = provided_count / len(self.DIMENSIONS)

        # Fill missing values with median (5 - middle of 1-9 scale)
        X[np.isnan(X)] = 5.0

        # Scale
        X_scaled = self.scaler.transform(X.reshape(1, -1))

        # Predict cluster
        cluster = self.lda.predict(X_scaled)[0]

        # Get LDA score (position on discriminant axis)
        lda_score = self.lda.decision_function(X_scaled)[0]

        # Calculate confidence based on distance from decision boundary
        # LDA decision boundary is at 0
        distance_to_boundary = abs(lda_score)

        # Confidence: higher when further from boundary
        # Normalize based on observed data range
        confidence = min(1.0, distance_to_boundary / 3.0)  # 3.0 is typical strong separation

        # Adjust confidence based on completeness
        adjusted_confidence = confidence * (0.5 + 0.5 * completeness)

        # Recommend form
        # Use long form when:
        # 1. Low confidence (< 0.6)
        # 2. Low completeness (< 0.5 of dimensions provided)
        # 3. Near boundary (< 0.5 distance)
        if adjusted_confidence < 0.6 or completeness < 0.5 or distance_to_boundary < 0.5:
            recommended_form = 'long'
        else:
            recommended_form = 'short'

        if not return_details:
            return {
                'cluster': int(cluster),
                'cluster_name': self.CLUSTER_NAMES[cluster],
                'confidence': float(adjusted_confidence),
                'recommended_form': recommended_form
            }

        # Calculate distances to each centroid
        distances = {}
        for cluster_id, centroid in self.cluster_centroids.items():
            dist = np.linalg.norm(X_scaled - centroid)
            distances[cluster_id] = float(dist)

        return {
            'cluster': int(cluster),
            'cluster_name': self.CLUSTER_NAMES[cluster],
            'confidence': float(adjusted_confidence),
            'completeness': float(completeness),
            'dimensions_provided': provided_count,
            'dimensions_total': len(self.DIMENSIONS),
            'recommended_form': recommended_form,
            'distance_to_boundary': float(distance_to_boundary),
            'lda_score': float(lda_score),
            'distances_to_centroids': distances,
            'key_dimensions_provided': sum(1 for dim in self.KEY_DIMENSIONS if dim in ratings),
            'key_dimensions_total': len(self.KEY_DIMENSIONS),
        }

    def get_key_dimensions(self):
        """Return the most important dimensions for classification."""
        return self.KEY_DIMENSIONS.copy()

    def get_short_form_dimensions(self):
        """Return recommended dimensions for short form."""
        return self.KEY_DIMENSIONS

    def explain_classification(self, ratings):
        """
        Provide human-readable explanation of classification.

        Args:
            ratings: Dict mapping dimension names to values

        Returns:
            String explanation
        """
        result = self.predict(ratings, return_details=True)

        explanation = []
        explanation.append(f"Protocol Classification: {result['cluster_name']}")
        explanation.append(f"Confidence: {result['confidence']:.0%}")
        explanation.append(f"")

        # Positive LDA scores fall on the Institutional side of the axis.
        if result['lda_score'] > 0:
            explanation.append(f"This protocol leans toward Institutional/Bureaucratic characteristics:")
            explanation.append(f"  - More likely to be formal, standardized, top-down")
            explanation.append(f"  - May involve state/corporate enforcement")
            explanation.append(f"  - Tends toward precise, documented procedures")
        else:
            explanation.append(f"This protocol leans toward Relational/Cultural characteristics:")
            explanation.append(f"  - More likely to be emergent, community-based")
            explanation.append(f"  - May involve voluntary participation")
            explanation.append(f"  - Tends toward interpretive, flexible practices")

        explanation.append(f"")
        explanation.append(f"Distance from boundary: {result['distance_to_boundary']:.2f}")

        if result['distance_to_boundary'] < 0.5:
            explanation.append(f"⚠️  This protocol is near the boundary between families.")
            explanation.append(f"   It may exhibit characteristics of both types.")

        explanation.append(f"")
        explanation.append(f"Completeness: {result['completeness']:.0%} ({result['dimensions_provided']}/{result['dimensions_total']} dimensions)")

        if result['completeness'] < 1.0:
            explanation.append(f"Note: Missing dimensions filled with neutral values (5)")
            explanation.append(f"      Confidence improves with complete data")

        explanation.append(f"")
        explanation.append(f"Recommended form: {result['recommended_form'].upper()}")

        if result['recommended_form'] == 'long':
            explanation.append(f"Reason: Use long form for:")
            if result['confidence'] < 0.6:
                explanation.append(f"  - Low classification confidence")
            if result['completeness'] < 0.5:
                explanation.append(f"  - Incomplete data")
            if result['distance_to_boundary'] < 0.5:
                explanation.append(f"  - Ambiguous positioning between families")
        else:
            explanation.append(f"Reason: High confidence classification with {result['completeness']:.0%} data")

        return "\n".join(explanation)

    def save_model(self, output_path='bicorder_classifier_model.json'):
        """Save model parameters for use without scikit-learn."""
        model_data = {
            'dimensions': self.DIMENSIONS,
            'key_dimensions': self.KEY_DIMENSIONS,
            'cluster_names': self.CLUSTER_NAMES,
            'scaler_mean': self.scaler.mean_.tolist(),
            'scaler_std': self.scaler.scale_.tolist(),
            'lda_coef': self.lda.coef_.tolist(),
            'lda_intercept': self.lda.intercept_.tolist(),
            'cluster_centroids': {
                str(k): v.tolist() for k, v in self.cluster_centroids.items()
            }
        }

        with open(output_path, 'w') as f:
            json.dump(model_data, f, indent=2)

        print(f"Model saved to {output_path}")
        return output_path
|
||||
|
||||
|
||||
def main():
    """Demo usage of the classifier."""
    print("=" * 80)
    print("BICORDER CLUSTER CLASSIFIER - DEMO")
    print("=" * 80)

    classifier = BicorderClassifier()

    # NOTE(review): the demo ratings below use pre-rename keys (e.g.
    # 'Design_elite_vs_vernacular', an old name per _COLUMN_RENAMES);
    # predict() silently ignores keys absent from self.DIMENSIONS, so these
    # examples may under-count provided dimensions — verify against the
    # current bicorder.json.

    # Example 1: Relational/Cultural protocol (e.g., Indigenous knowledge sharing)
    print("\nExample 1: Community-Based Protocol")
    print("-" * 80)
    ratings_relational = {
        'Design_elite_vs_vernacular': 9,        # Very vernacular
        'Design_explicit_vs_implicit': 8,       # More implicit
        'Entanglement_flocking_vs_swarming': 9,     # Swarming
        'Entanglement_obligatory_vs_voluntary': 9,  # Voluntary
        'Design_static_vs_malleable': 8,        # Malleable
        'Design_technical_vs_social': 9,        # Social
    }

    print(classifier.explain_classification(ratings_relational))

    # Example 2: Institutional protocol (e.g., Airport security)
    print("\n\n" + "=" * 80)
    print("Example 2: Institutional Protocol")
    print("-" * 80)
    ratings_institutional = {
        'Design_elite_vs_vernacular': 1,        # Elite
        'Design_explicit_vs_implicit': 1,       # Very explicit
        'Entanglement_flocking_vs_swarming': 1,     # Flocking
        'Entanglement_obligatory_vs_voluntary': 1,  # Obligatory
        'Design_static_vs_malleable': 2,        # Static
        'Design_technical_vs_social': 2,        # Technical
        'Entanglement_sovereign_vs_subsidiary': 1,  # Sovereign
    }

    print(classifier.explain_classification(ratings_institutional))

    # Example 3: Ambiguous/boundary protocol
    print("\n\n" + "=" * 80)
    print("Example 3: Boundary Protocol (mixed characteristics)")
    print("-" * 80)
    ratings_boundary = {
        'Design_elite_vs_vernacular': 5,        # Middle
        'Design_explicit_vs_implicit': 4,       # Slightly implicit
        'Entanglement_flocking_vs_swarming': 5,     # Middle
        'Entanglement_obligatory_vs_voluntary': 6,  # Slightly voluntary
    }

    print(classifier.explain_classification(ratings_boundary))

    # Save model
    print("\n\n" + "=" * 80)
    classifier.save_model()
    print("\nKey dimensions for short form:")
    for dim in classifier.get_key_dimensions():
        print(f"  - {dim}")


if __name__ == '__main__':
    main()
|
||||
95
analysis/scripts/bicorder_init.py
Normal file
95
analysis/scripts/bicorder_init.py
Normal file
@@ -0,0 +1,95 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Initialize LLM conversation with bicorder framework and protocol context.
|
||||
|
||||
This script reads a protocol from the CSV and the bicorder.json framework,
|
||||
then generates a prompt to initialize the LLM conversation.
|
||||
"""
|
||||
|
||||
import csv
|
||||
import json
|
||||
import sys
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def load_bicorder_config(bicorder_path):
    """Parse the bicorder.json configuration file into a dict."""
    with open(bicorder_path, 'r') as fh:
        config = json.load(fh)
    return config
|
||||
|
||||
|
||||
def get_protocol_by_row(csv_path, row_number):
    """Get protocol data from CSV by row number (1-indexed).

    Returns a dict with 'descriptor' and 'description' (whitespace-stripped),
    or None when row_number is out of range.
    """
    with open(csv_path, 'r', encoding='utf-8') as handle:
        for index, record in enumerate(csv.DictReader(handle), start=1):
            if index != row_number:
                continue
            return {
                'descriptor': record.get('Descriptor', '').strip(),
                'description': record.get('Description', '').strip()
            }
    return None
|
||||
|
||||
|
||||
def generate_init_prompt(protocol, bicorder_data):
    """Generate the initialization prompt for the LLM.

    Ultra-minimal version for a system prompt. bicorder_data is accepted
    for interface symmetry but is not embedded in this minimal prompt.
    """
    header = f'Analyze this protocol: "{protocol["descriptor"]}"'
    body = f"Description: {protocol['description']}"
    task = ("Task: Rate this protocol on diagnostic gradients using scale 1-9 "
            "(1=left term, 5=neutral/balanced, 9=right term). Respond with just "
            "the number and brief explanation.")
    return "\n\n".join([header, body, task])
|
||||
|
||||
|
||||
def main():
    """Parse CLI arguments, validate inputs, and print the init prompt to stdout."""
    parser = argparse.ArgumentParser(
        description='Initialize LLM conversation with protocol and bicorder framework',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
  # Initialize conversation for protocol in row 1
  python3 bicorder_init.py protocols_edited.csv 1 | llm -m mistral --save init_1

  # Initialize for row 5
  python3 bicorder_init.py protocols_edited.csv 5 | llm -m mistral --save init_5
"""
    )

    parser.add_argument('input_csv', help='Input CSV file with protocol data')
    parser.add_argument('row_number', type=int, help='Row number to analyze (1-indexed)')
    parser.add_argument('-b', '--bicorder',
                        default='../bicorder.json',
                        help='Path to bicorder.json (default: ../bicorder.json)')

    args = parser.parse_args()

    # Validate input file exists
    if not Path(args.input_csv).exists():
        print(f"Error: Input file '{args.input_csv}' not found", file=sys.stderr)
        sys.exit(1)

    # Validate bicorder.json exists
    if not Path(args.bicorder).exists():
        print(f"Error: Bicorder config '{args.bicorder}' not found", file=sys.stderr)
        sys.exit(1)

    # Load protocol
    protocol = get_protocol_by_row(args.input_csv, args.row_number)
    if protocol is None:
        print(f"Error: Row {args.row_number} not found in CSV", file=sys.stderr)
        sys.exit(1)

    # Load bicorder config
    bicorder_data = load_bicorder_config(args.bicorder)

    # Generate and output prompt (stdout, so it can be piped to `llm`)
    prompt = generate_init_prompt(protocol, bicorder_data)
    print(prompt)


if __name__ == '__main__':
    main()
|
||||
230
analysis/scripts/bicorder_query.py
Normal file
230
analysis/scripts/bicorder_query.py
Normal file
@@ -0,0 +1,230 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Query LLM for individual gradient values and update CSV.
|
||||
|
||||
This script generates prompts for each gradient, queries the LLM conversation,
|
||||
and updates the CSV with the returned values.
|
||||
"""
|
||||
|
||||
import csv
|
||||
import json
|
||||
import sys
|
||||
import argparse
|
||||
import subprocess
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def load_bicorder_config(bicorder_path):
    """Parse the bicorder.json configuration file and return it as a dict."""
    with open(bicorder_path, 'r') as config_file:
        parsed = json.load(config_file)
    return parsed
|
||||
|
||||
|
||||
def extract_gradients(bicorder_data):
    """Flatten every gradient in all diagnostic sets into a list of dicts.

    Each entry carries the CSV column name ("<set>_<left>_vs_<right>"),
    the set name, and both terms with their descriptions.
    """
    flattened = []
    for diag_set in bicorder_data['diagnostic']:
        name = diag_set['set_name']
        for grad in diag_set['gradients']:
            flattened.append({
                'column_name': f"{name}_{grad['term_left']}_vs_{grad['term_right']}",
                'set_name': name,
                'term_left': grad['term_left'],
                'term_left_description': grad['term_left_description'],
                'term_right': grad['term_right'],
                'term_right_description': grad['term_right_description'],
            })
    return flattened
|
||||
|
||||
|
||||
def get_protocol_by_row(csv_path, row_number):
    """Return {'descriptor', 'description'} for the 1-indexed CSV data row.

    Returns None when the row number is out of range. Both fields are
    whitespace-stripped; missing columns yield empty strings.
    """
    with open(csv_path, 'r', encoding='utf-8') as handle:
        for index, record in enumerate(csv.DictReader(handle), start=1):
            if index != row_number:
                continue
            return {
                'descriptor': record.get('Descriptor', '').strip(),
                'description': record.get('Description', '').strip(),
            }
    return None
|
||||
|
||||
|
||||
def generate_gradient_prompt(protocol_descriptor, protocol_description, gradient):
    """Build the LLM prompt asking for a 1-9 rating on a single gradient."""
    left = gradient['term_left']
    right = gradient['term_right']
    return f"""Analyze this protocol: "{protocol_descriptor}"

Description: {protocol_description}

Evaluate the protocol on this gradient:

**{left}** (1) vs **{right}** (9)

- **{left}**: {gradient['term_left_description']}
- **{right}**: {gradient['term_right_description']}

Provide a rating from 1 to 9, where:
- 1 = strongly {left}
- 5 = neutral/balanced/not applicable
- 9 = strongly {right}

Respond with ONLY the number (1-9), optionally followed by a brief explanation.
"""
|
||||
|
||||
|
||||
def query_llm(prompt, model=None):
    """Send *prompt* to the `llm` CLI and return its stripped stdout, or None.

    A fresh `llm` invocation is made per call (no conversation state).
    Failures — a non-zero exit status, or the `llm` executable missing
    from PATH — are reported to stderr and yield None rather than
    raising, so one bad gradient query does not abort the whole run.
    (Previously a missing `llm` binary raised an uncaught
    FileNotFoundError and crashed the script.)
    """
    cmd = ['llm']
    if model:
        cmd.extend(['-m', model])

    try:
        result = subprocess.run(
            cmd,
            input=prompt,
            text=True,
            capture_output=True,
            check=True
        )
        return result.stdout.strip()
    except subprocess.CalledProcessError as e:
        print(f" Error calling llm: {e.stderr}", file=sys.stderr)
        return None
    except FileNotFoundError:
        # `llm` CLI not installed / not on PATH.
        print(" Error: `llm` command not found on PATH", file=sys.stderr)
        return None
|
||||
|
||||
|
||||
def extract_value(llm_response):
    """Extract a numeric rating (1-9) from an LLM response, or None.

    First honors the requested format — a digit at the very start of the
    response ("7", "7 - because ..."). If the model prefixed text anyway
    (e.g. "Rating: 7" or "**7**"), fall back to the first standalone
    digit 1-9; the old start-anchored regex rejected such responses.
    """
    text = llm_response.strip()

    # Requested format: response begins with the rating digit.
    match = re.search(r'^(\d)', text)
    if match:
        value = int(match.group(1))
        # A leading out-of-range digit (e.g. "0 ...") is still invalid.
        return value if 1 <= value <= 9 else None

    # Fallback: first standalone digit 1-9 anywhere in the response.
    # Word boundaries keep us from grabbing the "1" out of "10" or "2024".
    loose = re.search(r'\b([1-9])\b', text)
    return int(loose.group(1)) if loose else None
|
||||
|
||||
|
||||
def update_csv_cell(csv_path, row_number, column_name, value):
    """Set one cell of the CSV (1-indexed row) and rewrite the file.

    Returns True if the row exists and the file was rewritten, False
    otherwise. The old bounds check (`row_number <= len(rows)`) accepted
    0 and negative numbers, which silently updated rows from the END of
    the file via Python's negative indexing — now rejected. If
    column_name is not yet a column, it is appended to the header and
    other rows get an empty value for it.
    """
    # Read the whole file into memory (these CSVs are small).
    with open(csv_path, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        fieldnames = list(reader.fieldnames or [])
        rows = list(reader)

    if not (1 <= row_number <= len(rows)):
        return False

    if column_name not in fieldnames:
        # New gradient column: extend the header so DictWriter accepts it.
        fieldnames.append(column_name)

    rows[row_number - 1][column_name] = str(value)

    # Rewrite the whole file in one pass.
    with open(csv_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, restval='')
        writer.writeheader()
        writer.writerows(rows)
    return True
|
||||
|
||||
|
||||
def main():
    """CLI entry point: rate every gradient for one CSV row via the llm CLI.

    For each gradient in bicorder.json, a standalone prompt is sent to
    `llm` (a new conversation each time), the numeric rating is parsed
    from the response, and the matching CSV cell is updated.
    """
    parser = argparse.ArgumentParser(
        description='Query LLM for gradient values and update CSV',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
  # Query all gradients for protocol in row 1
  python3 bicorder_query.py analysis_output.csv 1

  # Query specific model
  python3 bicorder_query.py analysis_output.csv 1 -m mistral

  # Dry run (show prompts without calling LLM)
  python3 bicorder_query.py analysis_output.csv 1 --dry-run
"""
    )

    parser.add_argument('csv_path', help='CSV file to update')
    parser.add_argument('row_number', type=int, help='Row number to analyze (1-indexed)')
    parser.add_argument('-b', '--bicorder',
                        default='../bicorder.json',
                        help='Path to bicorder.json (default: ../bicorder.json)')
    parser.add_argument('-m', '--model', help='LLM model to use')
    parser.add_argument('--dry-run', action='store_true',
                        help='Show prompts without calling LLM or updating CSV')

    args = parser.parse_args()

    # Validate files exist before doing any work
    if not Path(args.csv_path).exists():
        print(f"Error: CSV file '{args.csv_path}' not found", file=sys.stderr)
        sys.exit(1)

    if not Path(args.bicorder).exists():
        print(f"Error: Bicorder config '{args.bicorder}' not found", file=sys.stderr)
        sys.exit(1)

    # Load protocol data (descriptor + description) for the requested row
    protocol = get_protocol_by_row(args.csv_path, args.row_number)
    if protocol is None:
        print(f"Error: Row {args.row_number} not found in CSV", file=sys.stderr)
        sys.exit(1)

    # Load bicorder config and flatten it into the gradient list
    bicorder_data = load_bicorder_config(args.bicorder)
    gradients = extract_gradients(bicorder_data)

    if args.dry_run:
        print(f"DRY RUN: Row {args.row_number}, {len(gradients)} gradients")
        print(f"Protocol: {protocol['descriptor']}\n")
    else:
        print(f"Protocol: {protocol['descriptor']}")
        print(f"Loaded {len(gradients)} gradients, starting queries...")

    # Process each gradient independently; a failure on one gradient
    # does not stop the rest (FAILED / WARNING lines are printed instead).
    for i, gradient in enumerate(gradients, 1):
        gradient_short = gradient['column_name'].replace('_', ' ')

        if not args.dry_run:
            print(f"[{i}/{len(gradients)}] Querying: {gradient_short}...", flush=True)

        # Generate prompt (including protocol context)
        prompt = generate_gradient_prompt(
            protocol['descriptor'],
            protocol['description'],
            gradient
        )

        if args.dry_run:
            print(f"[{i}/{len(gradients)}] {gradient_short}")
            print(f"Prompt:\n{prompt}\n")
            continue

        # Query LLM (new chat each time; no conversation state is shared)
        response = query_llm(prompt, args.model)

        if response is None:
            print(f"[{i}/{len(gradients)}] {gradient_short}: FAILED")
            continue

        # Extract the numeric 1-9 value from the response text
        value = extract_value(response)
        if value is None:
            print(f"[{i}/{len(gradients)}] {gradient_short}: WARNING - no valid value")
            continue

        # Update CSV
        # NOTE(review): the CSV is re-read and rewritten once per gradient;
        # acceptable for these small files, but O(rows * gradients) overall.
        if update_csv_cell(args.csv_path, args.row_number, gradient['column_name'], value):
            print(f"[{i}/{len(gradients)}] {gradient_short}: {value}")
        else:
            print(f"[{i}/{len(gradients)}] {gradient_short}: ERROR updating CSV")

    if not args.dry_run:
        print(f"\n✓ CSV updated: {args.csv_path}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point (not run on import).
    main()
|
||||
8
analysis/scripts/chunk.sh
Normal file
8
analysis/scripts/chunk.sh
Normal file
@@ -0,0 +1,8 @@
|
||||
#!/bin/bash
# Extract protocols from each input document via the llm CLI and append
# CSV rows to output.csv. Usage: ./chunk.sh file1 [file2 ...]

prompt="Return csv-formatted data (with no markdown wrapper) that consists of a list of protocols discussed or referred to in the attached text. Protocols are defined extremely broadly as 'patterns of interaction,' and may be of a nontechnical nature. Protocols should be as specific as possible, such as 'Sacrament of Reconciliation' rather than 'Religious Protocols.' The first column should provide a brief descriptor of the protocol, and the second column should describe it in a substantial paragraph of 3-5 sentences, encapsulated in quotation marks to avoid breaking on commas. Be sure to paraphrase rather than quoting directly from the source text."

for file in "$@"; do
    # Quote "$file": unquoted it was subject to word splitting and
    # globbing, so paths with spaces broke the llm invocation.
    llm -m gemma3:12b -f "$file" "$prompt" >> output.csv
    echo "Completed $file"
done
|
||||
102
analysis/scripts/classify_readings.py
Normal file
102
analysis/scripts/classify_readings.py
Normal file
@@ -0,0 +1,102 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Apply the BicorderClassifier to all readings in a CSV and save results.
|
||||
|
||||
Uses the synthetic-trained LDA model by default. Missing dimensions are
|
||||
filled with the neutral value (5), so shortform readings can still be
|
||||
classified — though with lower confidence.
|
||||
|
||||
Usage:
|
||||
python3 scripts/classify_readings.py data/readings/manual_20260320/readings.csv
|
||||
python3 scripts/classify_readings.py data/readings/manual_20260320/readings.csv \\
|
||||
--training data/readings/synthetic_20251116/readings.csv \\
|
||||
--output data/readings/manual_20260320/analysis/classifications.csv
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from bicorder_classifier import BicorderClassifier
|
||||
|
||||
|
||||
def main():
    """Classify every reading in a CSV and write per-reading cluster assignments.

    Loads a BicorderClassifier trained on the --training CSV, classifies
    each row of the input CSV, writes one row of cluster/confidence data
    per reading, and prints a cluster/confidence/shortform summary.
    """
    parser = argparse.ArgumentParser(
        description='Classify all readings in a CSV using the BicorderClassifier'
    )
    parser.add_argument('input_csv', help='Readings CSV to classify')
    parser.add_argument(
        '--training',
        default='data/readings/synthetic_20251116/readings.csv',
        help='Training CSV for classifier (default: synthetic_20251116)'
    )
    parser.add_argument(
        '--output', default=None,
        help='Output CSV path (default: <dataset>/analysis/classifications.csv)'
    )
    args = parser.parse_args()

    input_path = Path(args.input_csv)
    # Default output lives next to the input dataset, under analysis/.
    output_path = (
        Path(args.output) if args.output
        else input_path.parent / 'analysis' / 'classifications.csv'
    )
    output_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"Loading classifier (training: {args.training})...")
    classifier = BicorderClassifier(diagnostic_csv=args.training)

    df = pd.read_csv(input_path)
    print(f"Classifying {len(df)} readings from {input_path}...")

    rows = []
    for _, record in df.iterrows():
        # Build ratings dict from dimension columns only. NaN (missing)
        # dimensions are omitted; per the module docstring the classifier
        # fills them with the neutral value 5.
        ratings = {
            col: float(record[col])
            for col in classifier.DIMENSIONS
            if col in record and pd.notna(record[col])
        }

        result = classifier.predict(ratings, return_details=True)

        # One output row per reading: identifying metadata plus the
        # classifier's cluster assignment and diagnostics.
        rows.append({
            'Descriptor': record.get('Descriptor', ''),
            'analyst': record.get('analyst', ''),
            'standpoint': record.get('standpoint', ''),
            'shortform': record.get('shortform', ''),
            'cluster': result['cluster'],
            'cluster_name': result['cluster_name'],
            'confidence': round(result['confidence'], 3),
            'lda_score': round(result['lda_score'], 3),
            'distance_to_boundary': round(result['distance_to_boundary'], 3),
            'completeness': round(result['completeness'], 3),
            'dimensions_provided': result['dimensions_provided'],
            'key_dims_provided': result['key_dimensions_provided'],
            'recommended_form': result['recommended_form'],
        })

    out_df = pd.DataFrame(rows)
    out_df.to_csv(output_path, index=False)
    print(f"Classifications saved → {output_path}")

    # Summary: cluster counts with percentages
    counts = out_df['cluster_name'].value_counts()
    print(f"\nCluster summary:")
    for name, count in counts.items():
        pct = count / len(out_df) * 100
        print(f"  {name}: {count} ({pct:.0f}%)")

    low_conf = (out_df['confidence'] < 0.4).sum()
    if low_conf:
        print(f"\n  {low_conf} readings with low confidence (<0.4) — may be boundary cases")

    # String-compare the flag: the column may round-trip through CSV as text.
    shortform_count = out_df[out_df['shortform'].astype(str) == 'True'].shape[0]
    if shortform_count:
        print(f"\n  {shortform_count} shortform readings classified (missing dims filled with neutral 5)")
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point (not run on import).
    main()
|
||||
186
analysis/scripts/compare_analyses.py
Normal file
186
analysis/scripts/compare_analyses.py
Normal file
@@ -0,0 +1,186 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Compare multiple analysis CSV files to determine which most closely resembles a reference file.
|
||||
Uses Euclidean distance, correlation, and RMSE metrics.
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from scipy.stats import pearsonr
|
||||
from pathlib import Path
|
||||
|
||||
def calculate_euclidean_distance(df1, df2, numeric_cols):
    """Per-row Euclidean distance between two aligned dataframes.

    NaNs contribute zero (np.nansum), so rows with missing dimensions
    are compared only on the dimensions both frames share.
    """
    per_row = [
        np.sqrt(np.nansum((df1.loc[row, numeric_cols] - df2.loc[row, numeric_cols]) ** 2))
        for row in df1.index
    ]
    return np.array(per_row)
|
||||
|
||||
def calculate_rmse(df1, df2, numeric_cols):
    """Root-mean-squared error over all numeric cells, ignoring NaNs."""
    squared_err = (df1[numeric_cols] - df2[numeric_cols]).values ** 2
    return np.sqrt(np.nanmean(squared_err))
|
||||
|
||||
def calculate_correlation(df1, df2, numeric_cols):
    """Pearson correlation across all jointly non-NaN numeric cells.

    Returns (corr, p_value); (nan, nan) when fewer than two valid pairs exist.
    """
    flat1 = df1[numeric_cols].values.flatten()
    flat2 = df2[numeric_cols].values.flatten()

    # Keep only positions where BOTH frames have a value.
    valid = ~(np.isnan(flat1) | np.isnan(flat2))
    if valid.sum() < 2:
        return np.nan, np.nan

    corr, pvalue = pearsonr(flat1[valid], flat2[valid])
    return corr, pvalue
|
||||
|
||||
def compare_analyses(reference_file, comparison_files):
    """Compare multiple analysis CSVs against a reference CSV.

    For each comparison file: align rows to the reference by Descriptor,
    compute Euclidean distance / RMSE / Pearson correlation over the
    rating-dimension columns, print the metrics, and finally print three
    rankings plus the 10 most-divergent protocols for the best match.
    Returns a dict of per-file results keyed by comparison filename.
    """

    # Read reference file
    print(f"Reading reference file: {reference_file}")
    ref_df = pd.read_csv(reference_file, quotechar='"', escapechar='\\', engine='python')
    # Get numeric columns (all the rating dimensions)
    numeric_cols = [col for col in ref_df.columns if
                    col.startswith(('Design_', 'Entanglement_', 'Experience_'))]

    # Convert numeric columns to numeric type, coercing errors to NaN
    for col in numeric_cols:
        ref_df[col] = pd.to_numeric(ref_df[col], errors='coerce')

    print(f"\nFound {len(numeric_cols)} numeric dimensions to compare")
    print(f"Comparing {len(ref_df)} protocols\n")
    print("="*80)

    results = {}

    for comp_file in comparison_files:
        print(f"\nComparing: {Path(comp_file).name}")
        print("-"*80)

        # Read comparison file
        comp_df = pd.read_csv(comp_file, quotechar='"', escapechar='\\', engine='python')

        # Convert numeric columns to numeric type, coercing errors to NaN
        for col in numeric_cols:
            comp_df[col] = pd.to_numeric(comp_df[col], errors='coerce')

        # Ensure same protocols in same order (match by Descriptor).
        # Left-merge keeps only ref_df's protocols, in ref_df's order;
        # protocols missing from comp_df become all-NaN rows and are
        # therefore ignored by the NaN-aware metrics below.
        if 'Descriptor' in ref_df.columns and 'Descriptor' in comp_df.columns:
            # Use merge to ensure exact matching - only keep protocols in ref_df
            comp_df = pd.merge(
                ref_df[['Descriptor']],
                comp_df,
                on='Descriptor',
                how='left'
            )

        # Calculate Euclidean distances using reset indices to ensure alignment
        ref_temp = ref_df.reset_index(drop=True)
        comp_temp = comp_df.reset_index(drop=True)
        euclidean_distances = calculate_euclidean_distance(ref_temp, comp_temp, numeric_cols)
        total_euclidean = np.sum(euclidean_distances)
        avg_euclidean = np.mean(euclidean_distances)

        # Calculate RMSE
        rmse = calculate_rmse(ref_temp, comp_temp, numeric_cols)

        # Calculate correlation
        correlation, p_value = calculate_correlation(ref_temp, comp_temp, numeric_cols)

        # Store results
        results[Path(comp_file).name] = {
            'total_euclidean': total_euclidean,
            'avg_euclidean': avg_euclidean,
            'rmse': rmse,
            'correlation': correlation,
            'p_value': p_value,
            'per_protocol_distances': euclidean_distances,
            'protocols': ref_df['Descriptor'].values if 'Descriptor' in ref_df.columns else None
        }

        # Print results
        print(f"  Total Euclidean Distance: {total_euclidean:.2f}")
        print(f"  Average Euclidean Distance: {avg_euclidean:.2f}")
        print(f"  RMSE: {rmse:.2f}")
        print(f"  Pearson Correlation: {correlation:.4f} (p={p_value:.2e})")

    # Summary comparison
    print("\n" + "="*80)
    print("SUMMARY RANKING (lower distance = more similar)")
    print("="*80)

    # Sort by average Euclidean distance
    sorted_by_euclidean = sorted(results.items(), key=lambda x: x[1]['avg_euclidean'])

    print("\nBy Average Euclidean Distance:")
    for i, (name, data) in enumerate(sorted_by_euclidean, 1):
        print(f"  {i}. {name:30s} - Avg Distance: {data['avg_euclidean']:.2f}")

    # Sort by correlation (higher is better)
    sorted_by_corr = sorted(results.items(), key=lambda x: x[1]['correlation'], reverse=True)

    print("\nBy Correlation (higher = more similar):")
    for i, (name, data) in enumerate(sorted_by_corr, 1):
        print(f"  {i}. {name:30s} - Correlation: {data['correlation']:.4f}")

    # Sort by RMSE
    sorted_by_rmse = sorted(results.items(), key=lambda x: x[1]['rmse'])

    print("\nBy RMSE (lower = more similar):")
    for i, (name, data) in enumerate(sorted_by_rmse, 1):
        print(f"  {i}. {name:30s} - RMSE: {data['rmse']:.2f}")

    # Show protocols with largest differences for the best match
    # ("best" = smallest average Euclidean distance)
    print("\n" + "="*80)
    best_match_name, best_match_data = sorted_by_euclidean[0]
    print(f"Top 10 protocols with largest differences from {best_match_name}:")
    print("="*80)

    if best_match_data['protocols'] is not None:
        distances = best_match_data['per_protocol_distances']
        protocols = best_match_data['protocols']
        # argsort is ascending: take the last 10, reversed for descending order
        top_diff_indices = np.argsort(distances)[-10:][::-1]

        for idx in top_diff_indices:
            print(f"  {protocols[idx]:50s} - Distance: {distances[idx]:.2f}")

    return results
|
||||
|
||||
if __name__ == "__main__":
    # Define file paths
    reference_file = "data/readings/synthetic_20251116/readings_manual.csv"
    comparison_files = [
        "data/readings/synthetic_20251116/readings_gemma3-12b.csv",
        "data/readings/synthetic_20251116/readings_gpt-oss.csv",
        "data/readings/synthetic_20251116/readings_mistral.csv"
    ]

    # The reference file is mandatory
    if not Path(reference_file).exists():
        print(f"Error: Reference file '{reference_file}' not found")
        exit(1)

    # Keep only the comparison files that exist.
    # (The old loop called list.remove() while iterating the same list,
    # which skips the element after each removal — two consecutive
    # missing files left the second one in and crashed later.)
    existing_files = []
    for file in comparison_files:
        if Path(file).exists():
            existing_files.append(file)
        else:
            print(f"Warning: Comparison file '{file}' not found, skipping...")
    comparison_files = existing_files

    if not comparison_files:
        print("Error: No comparison files found")
        exit(1)

    # Run comparison
    results = compare_analyses(reference_file, comparison_files)

    print("\n" + "="*80)
    print("Analysis complete!")
    print("="*80)
|
||||
189
analysis/scripts/convert_csv_to_json.py
Normal file
189
analysis/scripts/convert_csv_to_json.py
Normal file
@@ -0,0 +1,189 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Convert diagnostic_output.csv to individual JSON files following the bicorder.json spec.
|
||||
Handles mapping between old CSV column names and current spec terminology.
|
||||
"""
|
||||
|
||||
import csv
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Any, Optional
|
||||
import statistics
|
||||
|
||||
|
||||
# Mapping from CSV columns to spec terms
# Format: (csv_column_suffix, set_name, term_left, term_right)
# term_left/term_right use the CURRENT bicorder.json terminology; the
# csv_column_suffix preserves the historical column name found in older
# CSVs (see the "Changed:" notes where the two diverge).
GRADIENT_MAPPINGS = [
    # Design set
    ("explicit_vs_implicit", "Design", "explicit", "implicit"),
    ("precise_vs_interpretive", "Design", "precise", "interpretive"),
    ("elite_vs_vernacular", "Design", "institutional", "vernacular"),  # Changed: elite → institutional
    ("documenting_vs_enabling", "Design", "documenting", "enabling"),
    ("static_vs_malleable", "Design", "static", "malleable"),
    ("technical_vs_social", "Design", "technical", "social"),
    ("universal_vs_particular", "Design", "universal", "particular"),
    ("durable_vs_ephemeral", "Design", "durable", "ephemeral"),

    # Entanglement set
    ("macro_vs_micro", "Entanglement", "macro", "micro"),
    ("sovereign_vs_subsidiary", "Entanglement", "sovereign", "subsidiary"),
    ("self-enforcing_vs_enforced", "Entanglement", "self-enforcing", "enforced"),
    ("abstract_vs_embodied", "Entanglement", "abstract", "embodied"),
    ("obligatory_vs_voluntary", "Entanglement", "obligatory", "voluntary"),
    ("flocking_vs_swarming", "Entanglement", "flocking", "swarming"),
    ("defensible_vs_exposed", "Entanglement", "defensible", "exposed"),
    ("exclusive_vs_non-exclusive", "Entanglement", "monopolistic", "pluralistic"),  # Changed: exclusive → monopolistic

    # Experience set
    ("sufficient_vs_insufficient", "Experience", "sufficient", "limited"),  # Changed: insufficient → limited
    ("crystallized_vs_contested", "Experience", "crystallized", "contested"),
    ("trust-evading_vs_trust-inducing", "Experience", "trust-evading", "trust-inducing"),
    ("predictable_vs_emergent", "Experience", "predictable", "emergent"),
    ("exclusion_vs_inclusion", "Experience", "exclusion", "inclusion"),
    ("Kafka_vs_Whitehead", "Experience", "restraining", "liberating"),  # Changed: Kafka_vs_Whitehead → restraining_vs_liberating
    ("dead_vs_alive", "Experience", "dead", "alive"),
]
|
||||
|
||||
|
||||
def load_spec_template(spec_path: str) -> Dict[str, Any]:
    """Read bicorder.json and return the parsed spec used as a per-protocol template."""
    with open(spec_path, 'r') as spec_file:
        template = json.load(spec_file)
    return template
|
||||
|
||||
|
||||
def calculate_hardness(gradient_values: List[Optional[int]]) -> Optional[int]:
    """Hardness = mean of the non-None gradient values, rounded to an int.

    Returns None when no valid values are present.
    """
    present = [value for value in gradient_values if value is not None]
    if not present:
        return None
    return round(statistics.mean(present))
|
||||
|
||||
|
||||
def calculate_polarization(gradient_values: List[Optional[int]]) -> Optional[int]:
    """Map the average distance from the neutral value 5 onto a 9..1 scale.

    All values at the extremes (1 or 9) -> 1 (maximally polarized);
    all values at 5 -> 9 (fully centered). Returns None when no valid
    values are present.
    """
    present = [value for value in gradient_values if value is not None]
    if not present:
        return None

    # Mean absolute offset from the neutral midpoint (5); max possible is 4.
    mean_offset = statistics.mean(abs(value - 5) for value in present)

    # Linear map: offset 4 -> 1 (polarized), offset 0 -> 9 (centrist).
    return round(9 - (mean_offset / 4) * 8)
|
||||
|
||||
|
||||
def create_json_from_row(row: Dict[str, str], template: Dict[str, Any]) -> Dict[str, Any]:
    """Convert a CSV row to a JSON object following the spec.

    The template (parsed bicorder.json) is deep-copied, metadata and
    per-gradient values are filled in from the row, and the automated
    analysis fields (hardness, polarization) are computed from the
    values actually present.
    """
    result = json.loads(json.dumps(template))  # Deep copy via JSON round-trip

    # Update metadata
    result["metadata"]["protocol"] = row["Descriptor"]
    result["metadata"]["description"] = row["Description"]
    result["metadata"]["analyst"] = row["analyst"]
    result["metadata"]["standpoint"] = row["standpoint"]
    result["metadata"]["timestamp"] = None  # Not in CSV

    # Collect gradient values for analysis calculations
    gradient_values = []

    # Map CSV values to gradient objects
    for csv_suffix, set_name, term_left, term_right in GRADIENT_MAPPINGS:
        # CSV columns use the HISTORICAL suffix, prefixed with the set name.
        csv_column = f"{set_name}_{csv_suffix}"

        # Get the value from CSV (may be empty string)
        csv_value = row.get(csv_column, "").strip()
        value = int(csv_value) if csv_value else None

        if value is not None:
            gradient_values.append(value)

        # Find the corresponding gradient in the template by matching the
        # CURRENT term names (term_left/term_right from GRADIENT_MAPPINGS).
        for diagnostic_set in result["diagnostic"]:
            if diagnostic_set["set_name"] == set_name:
                for gradient in diagnostic_set["gradients"]:
                    if gradient["term_left"] == term_left and gradient["term_right"] == term_right:
                        gradient["value"] = value
                        break

    # Calculate automated analysis fields
    # NOTE(review): analysis entries are addressed by position — this
    # assumes bicorder.json keeps the order
    # [hardness, polarized, bureaucratic, usefulness]; confirm on spec changes.
    result["analysis"][0]["value"] = calculate_hardness(gradient_values)  # hardness
    result["analysis"][1]["value"] = calculate_polarization(gradient_values)  # polarized
    # analysis[2] is bureaucratic (LDA-based) - leave as null
    # analysis[3] is usefulness - leave as null (not automated)

    return result
|
||||
|
||||
|
||||
def main():
    """Convert a readings CSV into one JSON file per protocol.

    Reads the bicorder.json spec as a template, fills in each CSV row's
    gradient values plus the derived hardness/polarization fields, and
    writes <NNN>_<sanitized protocol name>.json files to the output
    directory.
    """
    import argparse
    parser = argparse.ArgumentParser(
        description='Convert diagnostic readings CSV to individual JSON files',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
  python3 scripts/convert_csv_to_json.py data/readings/synthetic_20251116/readings.csv
  python3 scripts/convert_csv_to_json.py data/readings/manual_20260101/readings.csv --output-dir data/readings/manual_20260101/json
"""
    )
    parser.add_argument('input_csv', help='Diagnostic readings CSV (e.g. data/readings/synthetic_20251116/readings.csv)')
    parser.add_argument('--output-dir', default=None,
                        help='Output directory for JSON files (default: <dataset_dir>/json)')
    parser.add_argument('--bicorder', default='../bicorder.json',
                        help='Path to bicorder.json (default: ../bicorder.json)')
    args = parser.parse_args()

    csv_path = args.input_csv
    spec_path = args.bicorder
    output_dir = args.output_dir if args.output_dir else str(Path(args.input_csv).parent / 'json')

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Load template
    template = load_spec_template(spec_path)

    # Process CSV
    with open(csv_path, 'r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)

        count = 0
        for i, row in enumerate(reader, start=1):
            # Create JSON object
            json_obj = create_json_from_row(row, template)

            # Generate a filesystem-safe filename from the protocol name.
            # BUG FIX: the sanitized name was computed and then immediately
            # discarded by an overwrite; it is now actually used, giving
            # files like "001_Sacrament of Reconciliation.json".
            protocol_name = row["Descriptor"]
            safe_name = protocol_name.replace("/", "_").replace("\\", "_")
            filename = f"{i:03d}_{safe_name}.json"

            # Write to file
            output_path = os.path.join(output_dir, filename)
            with open(output_path, 'w', encoding='utf-8') as jsonfile:
                json.dump(json_obj, jsonfile, indent=2)

            count += 1
            if count % 50 == 0:
                print(f"Processed {count} protocols...")

    print(f"\nConversion complete! Created {count} JSON files in {output_dir}/")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point (not run on import).
    main()
|
||||
172
analysis/scripts/export_model_for_js.py
Normal file
172
analysis/scripts/export_model_for_js.py
Normal file
@@ -0,0 +1,172 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Export the cluster classification model to JSON for use in JavaScript.
|
||||
|
||||
Reads dimension names directly from bicorder.json so the model always
|
||||
stays in sync with the current bicorder structure.
|
||||
|
||||
When gradients are renamed in bicorder.json, add the old→new mapping to
|
||||
COLUMN_RENAMES so the training CSV columns are correctly aligned.
|
||||
|
||||
Usage:
|
||||
python3 scripts/export_model_for_js.py data/readings/synthetic_20251116/readings.csv
|
||||
python3 scripts/export_model_for_js.py data/readings/manual_20260101/readings.csv --output bicorder_model.json
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
|
||||
|
||||
# Path to bicorder.json (relative to this script)
|
||||
BICORDER_JSON = Path(__file__).parent.parent.parent / 'bicorder.json'
|
||||
|
||||
# Historical column renames: maps old CSV column names → current bicorder.json names.
# Add an entry here whenever gradient terms are renamed in bicorder.json.
# Keys are column names as they appear in older training CSVs; values follow
# the "<set>_<left>_vs_<right>" convention derived from the current spec.
COLUMN_RENAMES: Dict[str, str] = {
    'Design_elite_vs_vernacular': 'Design_institutional_vs_vernacular',
    'Entanglement_exclusive_vs_non-exclusive': 'Entanglement_monopolistic_vs_pluralistic',
    'Experience_sufficient_vs_insufficient': 'Experience_sufficient_vs_limited',
    'Experience_Kafka_vs_Whitehead': 'Experience_restraining_vs_liberating',
}
|
||||
|
||||
|
||||
def load_bicorder_dimensions(bicorder_path):
    """Read (dimensions, key_dimensions, version) from bicorder.json.

    A dimension name is "<set>_<left>_vs_<right>"; key dimensions are the
    gradients flagged with "shortform": true in the spec.
    """
    with open(bicorder_path) as spec_file:
        spec = json.load(spec_file)

    all_dims = []
    key_dims = []
    for diag_set in spec['diagnostic']:
        prefix = diag_set['set_name']
        for grad in diag_set['gradients']:
            dim = f"{prefix}_{grad['term_left']}_vs_{grad['term_right']}"
            all_dims.append(dim)
            if grad.get('shortform', False):
                key_dims.append(dim)

    return all_dims, key_dims, spec['version']
|
||||
|
||||
|
||||
def main():
    """CLI entry point: train the LDA cluster model and export it as JSON.

    Pipeline: load the readings CSV plus the k-means cluster assignments from
    the dataset's analysis/ directory, align historical column names via
    COLUMN_RENAMES, fit a StandardScaler and a 1-component LDA on complete
    rows, then serialize everything a JavaScript client needs (scaler params,
    LDA coefficients, centroids, thresholds) into a single JSON file.
    """
    parser = argparse.ArgumentParser(
        description='Export cluster classification model to JSON for JavaScript',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
  python3 scripts/export_model_for_js.py data/readings/synthetic_20251116/readings.csv
  python3 scripts/export_model_for_js.py data/readings/manual_20260101/readings.csv --output bicorder_model.json
"""
    )
    parser.add_argument('input_csv', help='Diagnostic readings CSV (e.g. data/readings/synthetic_20251116/readings.csv)')
    parser.add_argument('--output', default='bicorder_model.json',
                        help='Output model JSON path (default: bicorder_model.json)')
    args = parser.parse_args()

    # Datasets are self-contained: readings.csv sits next to its analysis/ dir
    dataset_dir = Path(args.input_csv).parent
    analysis_dir = dataset_dir / 'analysis'

    # Derive dimensions and version from bicorder.json
    DIMENSIONS, KEY_DIMENSIONS, BICORDER_VERSION = load_bicorder_dimensions(BICORDER_JSON)

    print(f"Loaded bicorder.json v{BICORDER_VERSION}")
    print(f"Dimensions: {len(DIMENSIONS)}, key dimensions: {len(KEY_DIMENSIONS)}")

    # Load data: readings plus previously computed k-means cluster assignments
    df = pd.read_csv(args.input_csv)
    clusters = pd.read_csv(analysis_dir / 'data' / 'kmeans_clusters.csv')

    # Rename old column names to match current bicorder.json
    df = df.rename(columns=COLUMN_RENAMES)

    # Remove duplicates (keep the first reading of each protocol)
    df = df.drop_duplicates(subset='Descriptor', keep='first')

    # Merge and clean: only rows that have a cluster assignment and no missing
    # dimension values can be used for supervised training.
    merged = df.merge(clusters, on='Descriptor')
    merged_clean = merged.dropna(subset=DIMENSIONS)

    print(f"Training on {len(merged_clean)} protocols")

    # Prepare training data
    X = merged_clean[DIMENSIONS].values
    y = merged_clean['cluster'].values

    # Fit scaler (JS side must apply the same mean/scale before scoring)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Fit LDA — one discriminant axis is the maximum for two classes
    lda = LinearDiscriminantAnalysis(n_components=1)
    lda.fit(X_scaled, y)

    # Calculate cluster centroids in scaled space
    cluster_centroids = {}
    for cluster_id in [1, 2]:
        cluster_data = X_scaled[y == cluster_id]
        cluster_centroids[cluster_id] = cluster_data.mean(axis=0).tolist()

    # Calculate cluster means in original space (for reference)
    cluster_means_original = {}
    for cluster_id in [1, 2]:
        cluster_data_original = X[y == cluster_id]
        cluster_means_original[cluster_id] = cluster_data_original.mean(axis=0).tolist()

    # Build model export: everything the JS classifier needs in one document
    model = {
        'version': '1.0',
        'bicorder_version': BICORDER_VERSION,
        'generated': pd.Timestamp.now().isoformat(),
        'dimensions': DIMENSIONS,
        'key_dimensions': KEY_DIMENSIONS,
        'cluster_names': {
            '1': 'Relational/Cultural',
            '2': 'Institutional/Bureaucratic'
        },
        'cluster_descriptions': {
            '1': 'Community-based, emergent, voluntary, cultural protocols',
            '2': 'Formal, institutional, top-down, bureaucratic protocols'
        },
        'scaler': {
            'mean': scaler.mean_.tolist(),
            'scale': scaler.scale_.tolist()
        },
        'lda': {
            'coefficients': lda.coef_[0].tolist(),
            'intercept': lda.intercept_[0]
        },
        'cluster_centroids_scaled': cluster_centroids,
        'cluster_means_original': cluster_means_original,
        # Presumably consumed by the JS UI to flag low-confidence readings —
        # TODO confirm against the web client.
        'thresholds': {
            'confidence_low': 0.6,
            'completeness_low': 0.5,
            'boundary_distance_low': 0.5
        },
        'metadata': {
            'total_protocols': len(merged_clean),
            'cluster_1_count': int((y == 1).sum()),
            'cluster_2_count': int((y == 2).sum()),
        }
    }

    # Save to JSON
    with open(args.output, 'w') as f:
        json.dump(model, f, indent=2)

    print(f"\nModel exported to {args.output}")
    print(f"Bicorder version: {BICORDER_VERSION}")
    print(f"Total dimensions: {len(DIMENSIONS)}")
    print(f"Key dimensions (short form):")
    for dim in KEY_DIMENSIONS:
        print(f"  - {dim}")
    print(f"Model size: {len(json.dumps(model))} bytes")


if __name__ == '__main__':
    main()
|
||||
164
analysis/scripts/json_to_csv.py
Normal file
164
analysis/scripts/json_to_csv.py
Normal file
@@ -0,0 +1,164 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Convert a directory of individual bicorder JSON reading files into a diagnostic CSV.
|
||||
|
||||
This is the reverse of convert_csv_to_json.py. Each JSON file becomes one row.
|
||||
Handles readings across bicorder versions by matching on term_left/term_right pairs
|
||||
rather than column names.
|
||||
|
||||
Null gradient values (e.g., shortform readings that skip non-key dimensions) are
|
||||
written as empty cells so downstream analysis can treat them as NaN.
|
||||
|
||||
Usage:
|
||||
python3 scripts/json_to_csv.py data/readings/manual_20260320/json/
|
||||
python3 scripts/json_to_csv.py data/readings/manual_20260320/json/ -o data/readings/manual_20260320/readings.csv
|
||||
python3 scripts/json_to_csv.py data/readings/manual_20260320/json/ --shortform-only
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Map old term pairs to current column names (matches COLUMN_RENAMES in other scripts).
# Keys are (term_left, term_right) as found in older JSON files.
# NOTE(review): where the old pair's left/right order differs from the current
# one (the 'insufficient'/'sufficient' case) the recorded gradient value is NOT
# negated anywhere visible here — confirm downstream analysis accounts for the
# polarity flip.
TERM_RENAMES = {
    ('elite', 'vernacular'): ('institutional', 'vernacular'),
    ('exclusive', 'non-exclusive'): ('monopolistic', 'pluralistic'),
    ('insufficient', 'sufficient'): ('sufficient', 'limited'),  # note: order swapped in old versions
    ('Kafka', 'Whitehead'): ('restraining', 'liberating'),
}
|
||||
|
||||
|
||||
def load_bicorder_columns(bicorder_path):
    """Read ordered column definitions from bicorder.json.

    Returns:
        (columns, key_columns): the ordered list of "<set>_<left>_vs_<right>"
        column names, and the set of those flagged "shortform".
    """
    with open(bicorder_path) as fh:
        config = json.load(fh)

    ordered = []
    shortform = set()
    for diagnostic_set in config['diagnostic']:
        prefix = diagnostic_set['set_name']
        for grad in diagnostic_set['gradients']:
            name = f"{prefix}_{grad['term_left']}_vs_{grad['term_right']}"
            ordered.append(name)
            if grad.get('shortform', False):
                shortform.add(name)
    return ordered, shortform
|
||||
|
||||
|
||||
def normalize_terms(term_left, term_right):
    """Map an old term pair onto current bicorder.json terminology.

    Looks the pair up in TERM_RENAMES as given, then with left/right swapped
    (some old files reversed the poles); unknown pairs pass through unchanged.
    """
    direct = TERM_RENAMES.get((term_left, term_right))
    if direct is not None:
        return direct

    flipped = TERM_RENAMES.get((term_right, term_left))
    if flipped is not None:
        new_left, new_right = flipped
        # Restore the caller's (swapped) orientation
        return new_right, new_left

    return term_left, term_right
|
||||
|
||||
|
||||
def json_to_row(json_path, all_columns):
    """Convert one bicorder JSON reading file into a flat CSV row dict.

    Metadata fields come from the file's "metadata" block; each gradient value
    is keyed by its normalized "<set>_<left>_vs_<right>" column name. Null or
    absent gradient values become empty strings (read back downstream as NaN).
    """
    with open(json_path) as fh:
        reading = json.load(fh)

    meta = reading.get('metadata', {})
    row = {
        'Descriptor': meta.get('protocol', ''),
        'Description': '',  # not stored in individual reading files
        'analyst': meta.get('analyst', ''),
        'standpoint': meta.get('standpoint', ''),
        'timestamp': meta.get('timestamp', ''),
        'shortform': str(meta.get('shortform', '')),
        'version': reading.get('version', ''),
    }

    # Collect gradient values keyed by normalized column name
    values_by_column = {}
    for diagnostic_set in reading.get('diagnostic', []):
        prefix = diagnostic_set['set_name']
        for grad in diagnostic_set.get('gradients', []):
            left, right = normalize_terms(grad['term_left'], grad['term_right'])
            column = f"{prefix}_{left}_vs_{right}"
            value = grad.get('value')
            values_by_column[column] = '' if value is None else str(value)

    # Emit exactly the requested columns; anything missing stays empty
    for column in all_columns:
        row[column] = values_by_column.get(column, '')

    return row
|
||||
|
||||
|
||||
def main():
    """CLI entry point: convert a directory of JSON readings into one CSV."""
    parser = argparse.ArgumentParser(
        description='Convert directory of bicorder JSON files to a diagnostic CSV',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
  python3 scripts/json_to_csv.py data/readings/manual_20260320/json/
  python3 scripts/json_to_csv.py data/readings/manual_20260320/json/ --shortform-only
  python3 scripts/json_to_csv.py data/readings/manual_20260320/json/ -o data/readings/manual_20260320/readings.csv
"""
    )
    parser.add_argument('json_dir', help='Directory containing bicorder JSON reading files')
    parser.add_argument('-o', '--output', default=None,
                        help='Output CSV path (default: <dataset_dir>/readings.csv)')
    # NOTE(review): the default is resolved relative to the current working
    # directory, not this script's location — presumably run from analysis/.
    parser.add_argument('-b', '--bicorder', default='../bicorder.json',
                        help='Path to bicorder.json (default: ../bicorder.json)')
    parser.add_argument('--shortform-only', action='store_true',
                        help='Include only the key shortform dimensions (useful when most readings are shortform)')
    args = parser.parse_args()

    json_dir = Path(args.json_dir)
    # Datasets are self-contained: <dataset_dir>/json/ holds readings and
    # <dataset_dir>/readings.csv is the default output.
    dataset_dir = json_dir.parent
    output_path = Path(args.output) if args.output else dataset_dir / 'readings.csv'

    all_columns, key_columns = load_bicorder_columns(args.bicorder)

    if args.shortform_only:
        # Restrict to the key dimensions while preserving bicorder.json order
        columns = [c for c in all_columns if c in key_columns]
        print(f"Shortform mode: using {len(columns)} key dimensions")
    else:
        columns = all_columns

    json_files = sorted(json_dir.glob('*.json'))
    if not json_files:
        print(f"Error: no JSON files found in {json_dir}")
        return

    print(f"Converting {len(json_files)} JSON files → {output_path}")

    fieldnames = ['Descriptor', 'Description', 'analyst', 'standpoint',
                  'timestamp', 'shortform', 'version'] + columns

    rows = []
    for json_path in json_files:
        try:
            row = json_to_row(json_path, columns)
            rows.append(row)
        except Exception as e:
            # Best-effort conversion: report and skip a malformed file rather
            # than aborting the whole batch.
            print(f"  Warning: skipping {json_path.name}: {e}")

    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

    # Summary stats: count non-empty cells per dimension column
    filled = {col: sum(1 for r in rows if r.get(col)) for col in columns}
    print(f"Done. {len(rows)} rows written.")
    print(f"\nDimension coverage (readings with a value):")
    for col, count in filled.items():
        pct = count / len(rows) * 100 if rows else 0
        marker = '* ' if col in key_columns else '  '
        print(f"  {marker}{col}: {count}/{len(rows)} ({pct:.0f}%)")
    print(f"\n(* = shortform/key dimension)")


if __name__ == '__main__':
    main()
|
||||
177
analysis/scripts/lda_visualization.py
Normal file
177
analysis/scripts/lda_visualization.py
Normal file
@@ -0,0 +1,177 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Create LDA visualization to maximize cluster separation.
|
||||
|
||||
Usage:
|
||||
    python3 scripts/lda_visualization.py data/readings/synthetic_20251116/readings.csv
    python3 scripts/lda_visualization.py data/readings/manual_20260101/readings.csv --analysis-dir data/readings/manual_20260101/analysis
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def main():
    """CLI entry point: project clustered readings onto the LDA axis and plot.

    Loads the readings CSV and the k-means cluster assignments produced by
    multivariate_analysis.py, fits a 1-component LDA in standardized space,
    then writes a two-panel separation figure plus the projection and
    coefficient CSVs into the dataset's analysis/ directory.
    """
    parser = argparse.ArgumentParser(
        description='Create LDA visualization of cluster separation',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
  python3 scripts/lda_visualization.py data/readings/synthetic_20251116/readings.csv
  python3 scripts/lda_visualization.py data/readings/manual_20260101/readings.csv --analysis-dir data/readings/manual_20260101/analysis
"""
    )
    parser.add_argument('input_csv', help='Diagnostic readings CSV (e.g. data/readings/synthetic_20251116/readings.csv)')
    parser.add_argument('--analysis-dir', default=None,
                        help='Analysis directory (default: <dataset_dir>/analysis)')
    args = parser.parse_args()

    dataset_dir = Path(args.input_csv).parent
    results_dir = Path(args.analysis_dir) if args.analysis_dir else dataset_dir / 'analysis'
    # NOTE(review): plots/ and data/ are assumed to already exist (created by
    # multivariate_analysis.py) — savefig/to_csv will fail otherwise.
    plots_dir = results_dir / 'plots'
    data_dir = results_dir / 'data'

    # Load the original data
    df = pd.read_csv(args.input_csv)

    # Identify dimension columns by their diagnostic-set prefixes
    all_cols = df.columns.tolist()
    design_cols = [c for c in all_cols if c.startswith('Design_')]
    entanglement_cols = [c for c in all_cols if c.startswith('Entanglement_')]
    experience_cols = [c for c in all_cols if c.startswith('Experience_')]
    dimension_cols = design_cols + entanglement_cols + experience_cols

    # Load cluster assignments
    clusters = pd.read_csv(data_dir / 'kmeans_clusters.csv')
    df_with_clusters = df.merge(clusters, on='Descriptor')

    # Drop dimension columns with low coverage (< 80%) to handle shortform datasets
    n = len(df_with_clusters)
    coverage = df_with_clusters[dimension_cols].notna().sum() / n
    dimension_cols = [c for c in dimension_cols if coverage[c] >= 0.8]

    # Prepare data — impute any remaining NaNs with column median
    X_df = df_with_clusters[dimension_cols].copy()
    X_df = X_df.fillna(X_df.median())
    X = X_df.values
    y = df_with_clusters['cluster'].values

    # Standardize
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Fit LDA (with 1 component for 2 classes); ravel() flattens the (n, 1)
    # projection to 1-D so indexing below yields scalars, not 0-d arrays.
    lda = LinearDiscriminantAnalysis(n_components=1)
    X_lda = lda.fit_transform(X_scaled, y).ravel()

    # Create histogram showing separation
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 10))

    # Histogram: one translucent distribution per cluster
    colors = {1: '#2E86AB', 2: '#A23B72'}
    for cluster_id in [1, 2]:
        cluster_data = X_lda[y == cluster_id]
        ax1.hist(cluster_data, bins=30, alpha=0.6,
                 color=colors[cluster_id],
                 label=f'Cluster {cluster_id}',
                 edgecolor='white', linewidth=0.5)

    ax1.set_xlabel('Linear Discriminant (LD1)', fontsize=12)
    ax1.set_ylabel('Frequency', fontsize=12)
    ax1.set_title('Linear Discriminant Analysis: Cluster Separation\n(Maximum separation projection)',
                  fontsize=14, fontweight='bold')
    ax1.legend(fontsize=11)
    ax1.grid(True, alpha=0.3, axis='y')

    # Strip plot - shows individual protocols
    for cluster_id in [1, 2]:
        cluster_data = X_lda[y == cluster_id]
        cluster_protocols = df_with_clusters[df_with_clusters['cluster'] == cluster_id]['Descriptor'].values

        # Add jitter for visibility
        y_jitter = np.random.normal(cluster_id, 0.1, size=len(cluster_data))

        ax2.scatter(cluster_data, y_jitter,
                    c=colors[cluster_id], alpha=0.5, s=40,
                    edgecolors='white', linewidth=0.3)

        # Label a few representative protocols (every 25th)
        for i in range(0, len(cluster_data), 25):
            ax2.annotate(cluster_protocols[i],
                         (cluster_data[i], y_jitter[i]),
                         fontsize=7, alpha=0.7,
                         xytext=(0, 5), textcoords='offset points',
                         rotation=45, ha='left')

    ax2.set_xlabel('Linear Discriminant (LD1)', fontsize=12)
    ax2.set_ylabel('Cluster', fontsize=12)
    ax2.set_yticks([1, 2])
    ax2.set_yticklabels(['Cluster 1:\nRelational/Cultural', 'Cluster 2:\nInstitutional/Bureaucratic'])
    ax2.set_title('Individual Protocols Projected onto Discriminant Axis', fontsize=12)
    ax2.grid(True, alpha=0.3, axis='x')

    plt.tight_layout()
    plt.savefig(plots_dir / 'lda_cluster_separation.png', dpi=300, bbox_inches='tight')
    print(f"Saved: {plots_dir / 'lda_cluster_separation.png'}")

    # Calculate separation metrics
    mean_1 = X_lda[y == 1].mean()
    mean_2 = X_lda[y == 2].mean()
    std_1 = X_lda[y == 1].std()
    std_2 = X_lda[y == 2].std()

    # Cohen's d (effect size)
    pooled_std = np.sqrt((std_1**2 + std_2**2) / 2)
    cohens_d = abs(mean_1 - mean_2) / pooled_std

    print(f"\n=== Cluster Separation Statistics ===")
    # Defensive unwrap: if any statistic comes back as an ndarray rather than
    # a scalar, take its first element so the format specs below work.
    mean_1_val = mean_1[0] if isinstance(mean_1, np.ndarray) else mean_1
    mean_2_val = mean_2[0] if isinstance(mean_2, np.ndarray) else mean_2
    cohens_d_val = cohens_d[0] if isinstance(cohens_d, np.ndarray) else cohens_d
    print(f"Cluster 1 mean: {mean_1_val:.3f} (std: {std_1:.3f})")
    print(f"Cluster 2 mean: {mean_2_val:.3f} (std: {std_2:.3f})")
    print(f"Distance between means: {abs(mean_1_val - mean_2_val):.3f}")
    print(f"Cohen's d (effect size): {cohens_d_val:.3f}")
    print(f"  (0.2=small, 0.5=medium, 0.8=large effect)")

    # Overlap percentage (rough estimate from the range intersection)
    overlap_start = max(X_lda[y == 1].min(), X_lda[y == 2].min())
    overlap_end = min(X_lda[y == 1].max(), X_lda[y == 2].max())
    overlap_range = overlap_end - overlap_start if overlap_end > overlap_start else 0
    total_range = X_lda.max() - X_lda.min()
    overlap_pct = (overlap_range / total_range) * 100 if overlap_range > 0 else 0

    print(f"Approximate overlap: {overlap_pct:.1f}% of total range")

    # Save LDA projection data
    lda_df = pd.DataFrame({
        'Descriptor': df_with_clusters['Descriptor'],
        'LD1': X_lda.flatten(),
        'Cluster': y
    })
    lda_df.to_csv(data_dir / 'lda_projection.csv', index=False)
    print(f"Saved: {data_dir / 'lda_projection.csv'}")

    print("\n=== Most discriminating dimensions ===")
    loadings = pd.DataFrame({
        'Dimension': dimension_cols,
        'LDA_Coefficient': lda.coef_[0]
    })
    loadings['Abs_Coefficient'] = loadings['LDA_Coefficient'].abs()
    loadings = loadings.sort_values('Abs_Coefficient', ascending=False)

    print("\nTop 10 dimensions that separate the clusters:")
    for _, row in loadings.head(10).iterrows():
        print(f"  {row['Dimension']}: {row['LDA_Coefficient']:.3f}")

    loadings.to_csv(data_dir / 'lda_coefficients.csv', index=False)
    print(f"\nSaved: {data_dir / 'lda_coefficients.csv'}")


if __name__ == '__main__':
    main()
|
||||
858
analysis/scripts/multivariate_analysis.py
Normal file
858
analysis/scripts/multivariate_analysis.py
Normal file
@@ -0,0 +1,858 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Multivariate Analysis Script for Protocol Bicorder Data
|
||||
|
||||
Performs comprehensive multivariate statistical analyses on protocol diagnostic data,
|
||||
including clustering, dimensionality reduction, correlation analysis, and visualization.
|
||||
|
||||
Usage:
|
||||
python3 multivariate_analysis.py diagnostic_output.csv [--analyses all]
|
||||
python3 multivariate_analysis.py diagnostic_output.csv --analyses clustering pca
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
import warnings
|
||||
warnings.filterwarnings('ignore')
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
from scipy import stats
|
||||
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
|
||||
from scipy.spatial.distance import pdist, squareform
|
||||
import networkx as nx
|
||||
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.decomposition import PCA, FactorAnalysis
|
||||
from sklearn.manifold import TSNE
|
||||
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
|
||||
from sklearn.metrics import silhouette_score, davies_bouldin_score
|
||||
|
||||
try:
|
||||
import umap
|
||||
UMAP_AVAILABLE = True
|
||||
except ImportError:
|
||||
UMAP_AVAILABLE = False
|
||||
print("Note: UMAP not available. Install with: pip install umap-learn")
|
||||
|
||||
|
||||
class ProtocolAnalyzer:
|
||||
"""Main class for multivariate analysis of protocol data."""
|
||||
|
||||
def __init__(self, csv_path, output_dir='analysis_results', min_coverage=0.0):
|
||||
"""Initialize analyzer with data and output directory.
|
||||
|
||||
Args:
|
||||
csv_path: Path to diagnostic CSV file
|
||||
output_dir: Directory for analysis output
|
||||
min_coverage: Drop dimension columns with fewer than this fraction of
|
||||
non-null values (0.0–1.0). Useful for sparse/shortform
|
||||
datasets. E.g. 0.8 keeps only columns with ≥80% coverage.
|
||||
"""
|
||||
self.csv_path = Path(csv_path)
|
||||
self.output_dir = Path(output_dir)
|
||||
self.min_coverage = min_coverage
|
||||
self.output_dir.mkdir(exist_ok=True)
|
||||
|
||||
# Create subdirectories
|
||||
(self.output_dir / 'plots').mkdir(exist_ok=True)
|
||||
(self.output_dir / 'data').mkdir(exist_ok=True)
|
||||
(self.output_dir / 'reports').mkdir(exist_ok=True)
|
||||
|
||||
# Load and prepare data
|
||||
self.df = None
|
||||
self.dimension_cols = []
|
||||
self.design_cols = []
|
||||
self.entanglement_cols = []
|
||||
self.experience_cols = []
|
||||
self.scaled_data = None
|
||||
self.scaler = None
|
||||
|
||||
self._load_data()
|
||||
|
||||
    def _load_data(self):
        """Load the CSV, identify gradient columns, and standardize the data.

        Side effects: sets self.df (rows with missing dimension values
        dropped), self.dimension_cols and the per-category column lists,
        self.scaler, and self.scaled_data. When min_coverage > 0, low-coverage
        dimension columns are removed *before* rows are filtered — this keeps
        sparse/shortform datasets usable instead of dropping nearly every row.
        """
        print(f"Loading data from {self.csv_path}...")
        self.df = pd.read_csv(self.csv_path)

        # Identify dimension columns by their diagnostic-set prefixes
        all_cols = self.df.columns.tolist()
        self.design_cols = [c for c in all_cols if c.startswith('Design_')]
        self.entanglement_cols = [c for c in all_cols if c.startswith('Entanglement_')]
        self.experience_cols = [c for c in all_cols if c.startswith('Experience_')]
        self.dimension_cols = self.design_cols + self.entanglement_cols + self.experience_cols

        print(f"Loaded {len(self.df)} protocols with {len(self.dimension_cols)} dimensions")
        print(f"  - Design: {len(self.design_cols)}")
        print(f"  - Entanglement: {len(self.entanglement_cols)}")
        print(f"  - Experience: {len(self.experience_cols)}")

        # Drop low-coverage columns if min_coverage is set
        if self.min_coverage > 0.0:
            n = len(self.df)
            coverage = self.df[self.dimension_cols].notna().sum() / n
            dropped = [c for c in self.dimension_cols if coverage[c] < self.min_coverage]
            if dropped:
                print(f"\nDropping {len(dropped)} dimension(s) below {self.min_coverage:.0%} coverage:")
                for c in dropped:
                    print(f"  - {c}: {coverage[c]:.0%}")
                # Keep the per-category lists in sync with dimension_cols
                self.dimension_cols = [c for c in self.dimension_cols if c not in dropped]
                self.design_cols = [c for c in self.design_cols if c not in dropped]
                self.entanglement_cols = [c for c in self.entanglement_cols if c not in dropped]
                self.experience_cols = [c for c in self.experience_cols if c not in dropped]
                print(f"Remaining dimensions: {len(self.dimension_cols)}")

        # Check for missing values (in the columns that survived the filter)
        missing_count = self.df[self.dimension_cols].isna().sum().sum()
        rows_with_missing = self.df[self.dimension_cols].isna().any(axis=1).sum()

        if missing_count > 0:
            print(f"\nWarning: Found {missing_count} missing values in {rows_with_missing} rows")
            print("Dropping rows with missing values...")
            self.df = self.df.dropna(subset=self.dimension_cols)
            print(f"Dataset now contains {len(self.df)} protocols")

        # Standardize the dimension data (zero mean, unit variance per column)
        self.scaler = StandardScaler()
        self.scaled_data = self.scaler.fit_transform(self.df[self.dimension_cols])
|
||||
|
||||
def save_results(self, data, filename, subdir='data'):
|
||||
"""Save results to CSV file."""
|
||||
output_path = self.output_dir / subdir / filename
|
||||
if isinstance(data, pd.DataFrame):
|
||||
data.to_csv(output_path, index=False)
|
||||
else:
|
||||
pd.DataFrame(data).to_csv(output_path)
|
||||
print(f" Saved: {output_path}")
|
||||
|
||||
def save_plot(self, filename):
|
||||
"""Save current matplotlib figure."""
|
||||
output_path = self.output_dir / 'plots' / filename
|
||||
plt.tight_layout()
|
||||
plt.savefig(output_path, dpi=300, bbox_inches='tight')
|
||||
print(f" Saved: {output_path}")
|
||||
plt.close()
|
||||
|
||||
# ========== CLUSTERING ANALYSES ==========
|
||||
|
||||
    def kmeans_clustering(self, n_clusters_range=(2, 10)):
        """Perform K-means clustering, choosing k by silhouette score.

        Args:
            n_clusters_range: Inclusive (min_k, max_k) range of k to evaluate.

        Side effects: adds a 'kmeans_cluster' column to self.df and saves an
        elbow/silhouette plot plus kmeans_clusters.csv (1-indexed clusters).

        Returns:
            The 0-indexed cluster labels as a pandas Series.
        """
        print("\n=== K-Means Clustering ===")

        # Elbow method: fit once per candidate k, recording inertia + silhouette
        inertias = []
        silhouettes = []
        k_range = range(n_clusters_range[0], n_clusters_range[1] + 1)

        for k in k_range:
            kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
            labels = kmeans.fit_predict(self.scaled_data)
            inertias.append(kmeans.inertia_)
            if k > 1:
                silhouettes.append(silhouette_score(self.scaled_data, labels))
            else:
                # silhouette_score is undefined for a single cluster
                silhouettes.append(0)

        # Plot elbow curve and silhouette curve side by side
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

        ax1.plot(k_range, inertias, 'bo-')
        ax1.set_xlabel('Number of Clusters (k)')
        ax1.set_ylabel('Inertia')
        ax1.set_title('Elbow Method for Optimal k')
        ax1.grid(True, alpha=0.3)

        ax2.plot(k_range, silhouettes, 'ro-')
        ax2.set_xlabel('Number of Clusters (k)')
        ax2.set_ylabel('Silhouette Score')
        ax2.set_title('Silhouette Score by k')
        ax2.grid(True, alpha=0.3)

        self.save_plot('kmeans_elbow.png')

        # Use optimal k (highest silhouette)
        optimal_k = k_range[np.argmax(silhouettes)]
        print(f"Optimal k by silhouette score: {optimal_k}")

        # Final clustering at the chosen k
        kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
        self.df['kmeans_cluster'] = kmeans.fit_predict(self.scaled_data)

        # Save results
        results = self.df[['Descriptor', 'kmeans_cluster']].copy()
        results['cluster'] = results['kmeans_cluster'] + 1  # 1-indexed for readability
        self.save_results(results[['Descriptor', 'cluster']], 'kmeans_clusters.csv')

        # Cluster statistics
        print(f"\nCluster sizes:")
        print(self.df['kmeans_cluster'].value_counts().sort_index())

        return self.df['kmeans_cluster']
|
||||
|
||||
def hierarchical_clustering(self, n_clusters=5, method='ward'):
|
||||
"""Perform hierarchical clustering with dendrogram."""
|
||||
print("\n=== Hierarchical Clustering ===")
|
||||
|
||||
# Compute linkage
|
||||
Z = linkage(self.scaled_data, method=method)
|
||||
|
||||
# Plot dendrogram
|
||||
plt.figure(figsize=(16, 8))
|
||||
dendrogram(Z, labels=self.df['Descriptor'].values, leaf_font_size=8)
|
||||
plt.title(f'Hierarchical Clustering Dendrogram ({method} linkage)')
|
||||
plt.xlabel('Protocol')
|
||||
plt.ylabel('Distance')
|
||||
plt.xticks(rotation=90)
|
||||
self.save_plot('hierarchical_dendrogram.png')
|
||||
|
||||
# Cut tree to get clusters
|
||||
self.df['hierarchical_cluster'] = fcluster(Z, n_clusters, criterion='maxclust')
|
||||
|
||||
# Save results
|
||||
results = self.df[['Descriptor', 'hierarchical_cluster']].copy()
|
||||
results.columns = ['Descriptor', 'cluster']
|
||||
self.save_results(results, 'hierarchical_clusters.csv')
|
||||
|
||||
print(f"\nCluster sizes:")
|
||||
print(self.df['hierarchical_cluster'].value_counts().sort_index())
|
||||
|
||||
return self.df['hierarchical_cluster']
|
||||
|
||||
def dbscan_clustering(self, eps=3.0, min_samples=3):
|
||||
"""Perform DBSCAN clustering to identify outliers."""
|
||||
print("\n=== DBSCAN Clustering ===")
|
||||
|
||||
dbscan = DBSCAN(eps=eps, min_samples=min_samples)
|
||||
self.df['dbscan_cluster'] = dbscan.fit_predict(self.scaled_data)
|
||||
|
||||
n_clusters = len(set(self.df['dbscan_cluster'])) - (1 if -1 in self.df['dbscan_cluster'] else 0)
|
||||
n_outliers = (self.df['dbscan_cluster'] == -1).sum()
|
||||
|
||||
print(f"Found {n_clusters} clusters and {n_outliers} outliers")
|
||||
|
||||
# Save results
|
||||
results = self.df[['Descriptor', 'dbscan_cluster']].copy()
|
||||
results.columns = ['Descriptor', 'cluster']
|
||||
self.save_results(results, 'dbscan_clusters.csv')
|
||||
|
||||
if n_outliers > 0:
|
||||
outliers = self.df[self.df['dbscan_cluster'] == -1][['Descriptor']]
|
||||
self.save_results(outliers, 'dbscan_outliers.csv')
|
||||
print("\nOutlier protocols:")
|
||||
for protocol in outliers['Descriptor']:
|
||||
print(f" - {protocol}")
|
||||
|
||||
return self.df['dbscan_cluster']
|
||||
|
||||
# ========== DIMENSIONALITY REDUCTION ==========
|
||||
|
||||
def pca_analysis(self, n_components=None):
|
||||
"""Perform PCA and visualize results."""
|
||||
print("\n=== Principal Component Analysis ===")
|
||||
|
||||
# Fit PCA
|
||||
if n_components is None:
|
||||
pca = PCA()
|
||||
else:
|
||||
pca = PCA(n_components=n_components)
|
||||
|
||||
pca_coords = pca.fit_transform(self.scaled_data)
|
||||
|
||||
# Explained variance
|
||||
explained_var = pca.explained_variance_ratio_
|
||||
cumsum_var = np.cumsum(explained_var)
|
||||
|
||||
print(f"First 5 PCs explain {cumsum_var[4]*100:.1f}% of variance")
|
||||
|
||||
# Plot explained variance
|
||||
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
|
||||
|
||||
n_show = min(15, len(explained_var))
|
||||
ax1.bar(range(1, n_show + 1), explained_var[:n_show])
|
||||
ax1.set_xlabel('Principal Component')
|
||||
ax1.set_ylabel('Explained Variance Ratio')
|
||||
ax1.set_title('Variance Explained by Each PC')
|
||||
ax1.grid(True, alpha=0.3, axis='y')
|
||||
|
||||
ax2.plot(range(1, n_show + 1), cumsum_var[:n_show], 'o-')
|
||||
ax2.axhline(y=0.8, color='r', linestyle='--', alpha=0.5, label='80% threshold')
|
||||
ax2.set_xlabel('Number of Components')
|
||||
ax2.set_ylabel('Cumulative Explained Variance')
|
||||
ax2.set_title('Cumulative Variance Explained')
|
||||
ax2.legend()
|
||||
ax2.grid(True, alpha=0.3)
|
||||
|
||||
self.save_plot('pca_variance.png')
|
||||
|
||||
# 2D visualization
|
||||
plt.figure(figsize=(12, 10))
|
||||
plt.scatter(pca_coords[:, 0], pca_coords[:, 1], alpha=0.6, s=50)
|
||||
|
||||
# Annotate points
|
||||
for i, protocol in enumerate(self.df['Descriptor']):
|
||||
if i % 3 == 0: # Label every 3rd point to avoid clutter
|
||||
plt.annotate(protocol, (pca_coords[i, 0], pca_coords[i, 1]),
|
||||
fontsize=6, alpha=0.7)
|
||||
|
||||
plt.xlabel(f'PC1 ({explained_var[0]*100:.1f}% variance)')
|
||||
plt.ylabel(f'PC2 ({explained_var[1]*100:.1f}% variance)')
|
||||
plt.title('Protocols in PCA Space (First 2 Components)')
|
||||
plt.grid(True, alpha=0.3)
|
||||
self.save_plot('pca_2d.png')
|
||||
|
||||
# Save PCA coordinates
|
||||
pca_df = pd.DataFrame(pca_coords[:, :5],
|
||||
columns=[f'PC{i+1}' for i in range(min(5, pca_coords.shape[1]))])
|
||||
pca_df.insert(0, 'Descriptor', self.df['Descriptor'])
|
||||
self.save_results(pca_df, 'pca_coordinates.csv')
|
||||
|
||||
# Component loadings
|
||||
loadings = pd.DataFrame(
|
||||
pca.components_[:5, :].T,
|
||||
columns=[f'PC{i+1}' for i in range(min(5, pca.components_.shape[0]))],
|
||||
index=self.dimension_cols
|
||||
)
|
||||
self.save_results(loadings, 'pca_loadings.csv')
|
||||
|
||||
# Plot loadings heatmap
|
||||
plt.figure(figsize=(10, 12))
|
||||
sns.heatmap(loadings, cmap='RdBu_r', center=0, cbar_kws={'label': 'Loading'})
|
||||
plt.title('PCA Component Loadings')
|
||||
plt.tight_layout()
|
||||
self.save_plot('pca_loadings_heatmap.png')
|
||||
|
||||
return pca_coords, pca
|
||||
|
||||
def tsne_analysis(self, perplexity=30, n_components=2):
|
||||
"""Perform t-SNE analysis."""
|
||||
print("\n=== t-SNE Analysis ===")
|
||||
|
||||
tsne = TSNE(n_components=n_components, perplexity=perplexity, random_state=42, max_iter=1000)
|
||||
tsne_coords = tsne.fit_transform(self.scaled_data)
|
||||
|
||||
# Plot
|
||||
plt.figure(figsize=(12, 10))
|
||||
plt.scatter(tsne_coords[:, 0], tsne_coords[:, 1], alpha=0.6, s=50)
|
||||
|
||||
# Annotate some points
|
||||
for i, protocol in enumerate(self.df['Descriptor']):
|
||||
if i % 4 == 0: # Label every 4th point
|
||||
plt.annotate(protocol, (tsne_coords[i, 0], tsne_coords[i, 1]),
|
||||
fontsize=6, alpha=0.7)
|
||||
|
||||
plt.xlabel('t-SNE Dimension 1')
|
||||
plt.ylabel('t-SNE Dimension 2')
|
||||
plt.title(f't-SNE Projection (perplexity={perplexity})')
|
||||
plt.grid(True, alpha=0.3)
|
||||
self.save_plot('tsne_2d.png')
|
||||
|
||||
# Save coordinates
|
||||
tsne_df = pd.DataFrame(tsne_coords, columns=['TSNE1', 'TSNE2'])
|
||||
tsne_df.insert(0, 'Descriptor', self.df['Descriptor'])
|
||||
self.save_results(tsne_df, 'tsne_coordinates.csv')
|
||||
|
||||
return tsne_coords
|
||||
|
||||
def umap_analysis(self, n_neighbors=15, min_dist=0.1, n_components=2):
|
||||
"""Perform UMAP analysis if available."""
|
||||
if not UMAP_AVAILABLE:
|
||||
print("\n=== UMAP Analysis ===")
|
||||
print("UMAP not available. Install with: pip install umap-learn")
|
||||
return None
|
||||
|
||||
print("\n=== UMAP Analysis ===")
|
||||
|
||||
reducer = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist,
|
||||
n_components=n_components, random_state=42)
|
||||
umap_coords = reducer.fit_transform(self.scaled_data)
|
||||
|
||||
# Plot
|
||||
plt.figure(figsize=(12, 10))
|
||||
plt.scatter(umap_coords[:, 0], umap_coords[:, 1], alpha=0.6, s=50)
|
||||
|
||||
# Annotate some points
|
||||
for i, protocol in enumerate(self.df['Descriptor']):
|
||||
if i % 4 == 0:
|
||||
plt.annotate(protocol, (umap_coords[i, 0], umap_coords[i, 1]),
|
||||
fontsize=6, alpha=0.7)
|
||||
|
||||
plt.xlabel('UMAP Dimension 1')
|
||||
plt.ylabel('UMAP Dimension 2')
|
||||
plt.title(f'UMAP Projection (n_neighbors={n_neighbors}, min_dist={min_dist})')
|
||||
plt.grid(True, alpha=0.3)
|
||||
self.save_plot('umap_2d.png')
|
||||
|
||||
# Save coordinates
|
||||
umap_df = pd.DataFrame(umap_coords, columns=['UMAP1', 'UMAP2'])
|
||||
umap_df.insert(0, 'Descriptor', self.df['Descriptor'])
|
||||
self.save_results(umap_df, 'umap_coordinates.csv')
|
||||
|
||||
return umap_coords
|
||||
|
||||
def factor_analysis(self, n_factors=5):
|
||||
"""Perform factor analysis."""
|
||||
print("\n=== Factor Analysis ===")
|
||||
|
||||
fa = FactorAnalysis(n_components=n_factors, random_state=42)
|
||||
fa_coords = fa.fit_transform(self.scaled_data)
|
||||
|
||||
# Factor loadings
|
||||
loadings = pd.DataFrame(
|
||||
fa.components_.T,
|
||||
columns=[f'Factor{i+1}' for i in range(n_factors)],
|
||||
index=self.dimension_cols
|
||||
)
|
||||
self.save_results(loadings, 'factor_loadings.csv')
|
||||
|
||||
# Plot loadings heatmap
|
||||
plt.figure(figsize=(10, 12))
|
||||
sns.heatmap(loadings, cmap='RdBu_r', center=0, cbar_kws={'label': 'Loading'})
|
||||
plt.title('Factor Analysis Loadings')
|
||||
plt.tight_layout()
|
||||
self.save_plot('factor_loadings_heatmap.png')
|
||||
|
||||
# Save factor scores
|
||||
fa_df = pd.DataFrame(fa_coords,
|
||||
columns=[f'Factor{i+1}' for i in range(n_factors)])
|
||||
fa_df.insert(0, 'Descriptor', self.df['Descriptor'])
|
||||
self.save_results(fa_df, 'factor_scores.csv')
|
||||
|
||||
return fa_coords, fa
|
||||
|
||||
# ========== CORRELATION & STRUCTURE ==========
|
||||
|
||||
def correlation_analysis(self):
|
||||
"""Compute and visualize correlation matrices."""
|
||||
print("\n=== Correlation Analysis ===")
|
||||
|
||||
# Full correlation matrix
|
||||
corr_matrix = self.df[self.dimension_cols].corr()
|
||||
|
||||
# Plot full correlation heatmap
|
||||
plt.figure(figsize=(16, 14))
|
||||
sns.heatmap(corr_matrix, cmap='RdBu_r', center=0,
|
||||
square=True, linewidths=0.5, cbar_kws={'label': 'Correlation'})
|
||||
plt.title('Correlation Matrix - All Dimensions')
|
||||
plt.tight_layout()
|
||||
self.save_plot('correlation_heatmap_full.png')
|
||||
|
||||
# Save correlation matrix
|
||||
self.save_results(corr_matrix, 'correlation_matrix.csv')
|
||||
|
||||
# Find strongest correlations
|
||||
corr_pairs = []
|
||||
for i in range(len(corr_matrix.columns)):
|
||||
for j in range(i+1, len(corr_matrix.columns)):
|
||||
corr_pairs.append({
|
||||
'Dimension1': corr_matrix.columns[i],
|
||||
'Dimension2': corr_matrix.columns[j],
|
||||
'Correlation': corr_matrix.iloc[i, j]
|
||||
})
|
||||
|
||||
corr_df = pd.DataFrame(corr_pairs).sort_values('Correlation',
|
||||
key=abs,
|
||||
ascending=False)
|
||||
self.save_results(corr_df.head(20), 'top_correlations.csv')
|
||||
|
||||
print("\nTop 5 positive correlations:")
|
||||
for _, row in corr_df.head(5).iterrows():
|
||||
print(f" {row['Dimension1']} <-> {row['Dimension2']}: {row['Correlation']:.3f}")
|
||||
|
||||
print("\nTop 5 negative correlations:")
|
||||
for _, row in corr_df.tail(5).iterrows():
|
||||
print(f" {row['Dimension1']} <-> {row['Dimension2']}: {row['Correlation']:.3f}")
|
||||
|
||||
# Within-category correlations
|
||||
self._plot_category_correlation('Design', self.design_cols)
|
||||
self._plot_category_correlation('Entanglement', self.entanglement_cols)
|
||||
self._plot_category_correlation('Experience', self.experience_cols)
|
||||
|
||||
return corr_matrix
|
||||
|
||||
def _plot_category_correlation(self, category_name, columns):
|
||||
"""Plot correlation heatmap for a specific category."""
|
||||
corr = self.df[columns].corr()
|
||||
|
||||
plt.figure(figsize=(10, 8))
|
||||
sns.heatmap(corr, annot=True, fmt='.2f', cmap='RdBu_r', center=0,
|
||||
square=True, linewidths=0.5, cbar_kws={'label': 'Correlation'})
|
||||
plt.title(f'{category_name} Dimensions - Correlation Matrix')
|
||||
plt.tight_layout()
|
||||
self.save_plot(f'correlation_heatmap_{category_name.lower()}.png')
|
||||
|
||||
def network_analysis(self, threshold=0.5):
|
||||
"""Create network graph of protocol similarities."""
|
||||
print("\n=== Network Analysis ===")
|
||||
|
||||
# Compute pairwise distances
|
||||
distances = pdist(self.scaled_data, metric='euclidean')
|
||||
dist_matrix = squareform(distances)
|
||||
|
||||
# Convert to similarity (inverse of distance, normalized)
|
||||
max_dist = dist_matrix.max()
|
||||
similarity_matrix = 1 - (dist_matrix / max_dist)
|
||||
|
||||
# Create network
|
||||
G = nx.Graph()
|
||||
|
||||
# Add nodes
|
||||
for i, protocol in enumerate(self.df['Descriptor']):
|
||||
G.add_node(i, label=protocol)
|
||||
|
||||
# Add edges above threshold
|
||||
edge_count = 0
|
||||
for i in range(len(similarity_matrix)):
|
||||
for j in range(i+1, len(similarity_matrix)):
|
||||
if similarity_matrix[i, j] > threshold:
|
||||
G.add_edge(i, j, weight=similarity_matrix[i, j])
|
||||
edge_count += 1
|
||||
|
||||
print(f"Network with {G.number_of_nodes()} nodes and {edge_count} edges")
|
||||
|
||||
# Calculate network metrics
|
||||
if G.number_of_edges() > 0:
|
||||
degree_centrality = nx.degree_centrality(G)
|
||||
betweenness = nx.betweenness_centrality(G)
|
||||
|
||||
metrics_df = pd.DataFrame({
|
||||
'Descriptor': [self.df.iloc[i]['Descriptor'] for i in G.nodes()],
|
||||
'Degree_Centrality': [degree_centrality[i] for i in G.nodes()],
|
||||
'Betweenness_Centrality': [betweenness[i] for i in G.nodes()]
|
||||
}).sort_values('Degree_Centrality', ascending=False)
|
||||
|
||||
self.save_results(metrics_df, 'network_metrics.csv')
|
||||
|
||||
print("\nTop 5 most central protocols:")
|
||||
for _, row in metrics_df.head(5).iterrows():
|
||||
print(f" {row['Descriptor']}: {row['Degree_Centrality']:.3f}")
|
||||
|
||||
# Plot network
|
||||
plt.figure(figsize=(16, 16))
|
||||
pos = nx.spring_layout(G, k=0.5, iterations=50, seed=42)
|
||||
|
||||
# Node sizes based on degree centrality
|
||||
node_sizes = [degree_centrality[i] * 3000 + 100 for i in G.nodes()]
|
||||
|
||||
nx.draw_networkx_nodes(G, pos, node_size=node_sizes,
|
||||
node_color='lightblue', alpha=0.7)
|
||||
nx.draw_networkx_edges(G, pos, alpha=0.2)
|
||||
|
||||
# Labels for high-centrality nodes
|
||||
high_centrality = {i: self.df.iloc[i]['Descriptor']
|
||||
for i in G.nodes() if degree_centrality[i] > 0.1}
|
||||
nx.draw_networkx_labels(G, pos, labels=high_centrality, font_size=8)
|
||||
|
||||
plt.title(f'Protocol Similarity Network (threshold={threshold})')
|
||||
plt.axis('off')
|
||||
plt.tight_layout()
|
||||
self.save_plot('network_graph.png')
|
||||
else:
|
||||
print("No edges above threshold - try lowering the threshold")
|
||||
|
||||
return G
|
||||
|
||||
# ========== CLASSIFICATION & PREDICTION ==========
|
||||
|
||||
def category_discriminant_analysis(self):
|
||||
"""Analyze how well dimension categories discriminate protocols."""
|
||||
print("\n=== Category Discriminant Analysis ===")
|
||||
|
||||
results = []
|
||||
|
||||
for category_name, columns in [('Design', self.design_cols),
|
||||
('Entanglement', self.entanglement_cols),
|
||||
('Experience', self.experience_cols)]:
|
||||
|
||||
# Use one category to predict clustering from another
|
||||
X = self.df[columns].values
|
||||
|
||||
# Use kmeans clusters as target if available
|
||||
if 'kmeans_cluster' in self.df.columns:
|
||||
y = self.df['kmeans_cluster'].values
|
||||
|
||||
# LDA
|
||||
try:
|
||||
lda = LinearDiscriminantAnalysis()
|
||||
lda.fit(X, y)
|
||||
score = lda.score(X, y)
|
||||
|
||||
results.append({
|
||||
'Category': category_name,
|
||||
'Accuracy': score,
|
||||
'N_Dimensions': len(columns)
|
||||
})
|
||||
|
||||
print(f"{category_name} dimensions predict clusters with {score*100:.1f}% accuracy")
|
||||
except:
|
||||
print(f"Could not perform LDA for {category_name}")
|
||||
|
||||
if results:
|
||||
results_df = pd.DataFrame(results)
|
||||
self.save_results(results_df, 'category_discriminant_results.csv')
|
||||
|
||||
return results
|
||||
|
||||
def feature_importance_analysis(self):
|
||||
"""Analyze which dimensions are most important for clustering."""
|
||||
print("\n=== Feature Importance Analysis ===")
|
||||
|
||||
if 'kmeans_cluster' not in self.df.columns:
|
||||
print("Run clustering first to enable feature importance analysis")
|
||||
return None
|
||||
|
||||
# Random Forest classifier
|
||||
X = self.df[self.dimension_cols].values
|
||||
y = self.df['kmeans_cluster'].values
|
||||
|
||||
rf = RandomForestClassifier(n_estimators=100, random_state=42)
|
||||
rf.fit(X, y)
|
||||
|
||||
# Feature importances
|
||||
importances = pd.DataFrame({
|
||||
'Dimension': self.dimension_cols,
|
||||
'Importance': rf.feature_importances_
|
||||
}).sort_values('Importance', ascending=False)
|
||||
|
||||
self.save_results(importances, 'feature_importances.csv')
|
||||
|
||||
# Plot top 20
|
||||
plt.figure(figsize=(10, 12))
|
||||
top_20 = importances.head(20)
|
||||
plt.barh(range(len(top_20)), top_20['Importance'])
|
||||
plt.yticks(range(len(top_20)), top_20['Dimension'])
|
||||
plt.xlabel('Importance')
|
||||
plt.title('Top 20 Most Important Dimensions for Clustering')
|
||||
plt.gca().invert_yaxis()
|
||||
plt.tight_layout()
|
||||
self.save_plot('feature_importances.png')
|
||||
|
||||
print("\nTop 10 most important dimensions:")
|
||||
for _, row in importances.head(10).iterrows():
|
||||
print(f" {row['Dimension']}: {row['Importance']:.4f}")
|
||||
|
||||
return importances
|
||||
|
||||
def analyst_comparison(self):
|
||||
"""Compare ratings across different analysts."""
|
||||
print("\n=== Analyst Comparison ===")
|
||||
|
||||
if 'analyst' not in self.df.columns:
|
||||
print("No analyst column found")
|
||||
return None
|
||||
|
||||
analysts = self.df['analyst'].unique()
|
||||
print(f"Found {len(analysts)} unique analysts")
|
||||
|
||||
# Mean ratings by analyst for each dimension
|
||||
analyst_means = self.df.groupby('analyst')[self.dimension_cols].mean()
|
||||
self.save_results(analyst_means, 'analyst_mean_ratings.csv')
|
||||
|
||||
# Plot comparison
|
||||
fig, axes = plt.subplots(3, 1, figsize=(14, 12))
|
||||
|
||||
for idx, (category_name, columns) in enumerate([
|
||||
('Design', self.design_cols),
|
||||
('Entanglement', self.entanglement_cols),
|
||||
('Experience', self.experience_cols)
|
||||
]):
|
||||
analyst_means[columns].T.plot(ax=axes[idx], marker='o')
|
||||
axes[idx].set_title(f'{category_name} Dimensions - Mean Ratings by Analyst')
|
||||
axes[idx].set_ylabel('Mean Rating')
|
||||
axes[idx].legend(title='Analyst', bbox_to_anchor=(1.05, 1), loc='upper left')
|
||||
axes[idx].grid(True, alpha=0.3)
|
||||
axes[idx].set_xticklabels(axes[idx].get_xticklabels(), rotation=45, ha='right')
|
||||
|
||||
plt.tight_layout()
|
||||
self.save_plot('analyst_comparison.png')
|
||||
|
||||
return analyst_means
|
||||
|
||||
# ========== SUMMARY REPORT ==========
|
||||
|
||||
def generate_summary_report(self):
|
||||
"""Generate a text summary of all analyses."""
|
||||
print("\n=== Generating Summary Report ===")
|
||||
|
||||
report_lines = []
|
||||
report_lines.append("=" * 80)
|
||||
report_lines.append("MULTIVARIATE ANALYSIS SUMMARY REPORT")
|
||||
report_lines.append("Protocol Bicorder Dataset")
|
||||
report_lines.append("=" * 80)
|
||||
report_lines.append("")
|
||||
|
||||
report_lines.append(f"Dataset: {self.csv_path}")
|
||||
report_lines.append(f"Number of protocols: {len(self.df)}")
|
||||
report_lines.append(f"Number of dimensions: {len(self.dimension_cols)}")
|
||||
report_lines.append(f" - Design: {len(self.design_cols)}")
|
||||
report_lines.append(f" - Entanglement: {len(self.entanglement_cols)}")
|
||||
report_lines.append(f" - Experience: {len(self.experience_cols)}")
|
||||
report_lines.append("")
|
||||
|
||||
report_lines.append("-" * 80)
|
||||
report_lines.append("ANALYSES PERFORMED")
|
||||
report_lines.append("-" * 80)
|
||||
|
||||
# Check which analyses were run
|
||||
analyses_run = []
|
||||
|
||||
if 'kmeans_cluster' in self.df.columns:
|
||||
analyses_run.append("- K-Means Clustering")
|
||||
report_lines.append(f"K-Means: {len(self.df['kmeans_cluster'].unique())} clusters identified")
|
||||
|
||||
if 'hierarchical_cluster' in self.df.columns:
|
||||
analyses_run.append("- Hierarchical Clustering")
|
||||
report_lines.append(f"Hierarchical: {len(self.df['hierarchical_cluster'].unique())} clusters")
|
||||
|
||||
if 'dbscan_cluster' in self.df.columns:
|
||||
analyses_run.append("- DBSCAN Clustering")
|
||||
n_outliers = (self.df['dbscan_cluster'] == -1).sum()
|
||||
report_lines.append(f"DBSCAN: {n_outliers} outlier protocols identified")
|
||||
|
||||
report_lines.append("")
|
||||
report_lines.append("Dimensionality Reduction:")
|
||||
report_lines.append("- Principal Component Analysis (PCA)")
|
||||
report_lines.append("- t-SNE Projection")
|
||||
if UMAP_AVAILABLE:
|
||||
report_lines.append("- UMAP Projection")
|
||||
report_lines.append("- Factor Analysis")
|
||||
report_lines.append("")
|
||||
|
||||
report_lines.append("Statistical Analyses:")
|
||||
report_lines.append("- Correlation Analysis")
|
||||
report_lines.append("- Network Analysis")
|
||||
report_lines.append("- Feature Importance Analysis")
|
||||
|
||||
if 'analyst' in self.df.columns:
|
||||
report_lines.append("- Analyst Comparison")
|
||||
|
||||
report_lines.append("")
|
||||
report_lines.append("-" * 80)
|
||||
report_lines.append("OUTPUT FILES")
|
||||
report_lines.append("-" * 80)
|
||||
report_lines.append(f"All results saved to: {self.output_dir}/")
|
||||
report_lines.append(" - plots/ : All visualizations (PNG)")
|
||||
report_lines.append(" - data/ : All numerical results (CSV)")
|
||||
report_lines.append(" - reports/ : This summary report")
|
||||
report_lines.append("")
|
||||
|
||||
report_lines.append("=" * 80)
|
||||
report_lines.append("END OF REPORT")
|
||||
report_lines.append("=" * 80)
|
||||
|
||||
report_text = "\n".join(report_lines)
|
||||
|
||||
# Save report
|
||||
report_path = self.output_dir / 'reports' / 'analysis_summary.txt'
|
||||
with open(report_path, 'w') as f:
|
||||
f.write(report_text)
|
||||
|
||||
print(f" Saved: {report_path}")
|
||||
print("\n" + report_text)
|
||||
|
||||
return report_text
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse arguments and run the requested analyses."""
    parser = argparse.ArgumentParser(
        description='Multivariate analysis of Protocol Bicorder data',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python3 scripts/multivariate_analysis.py data/readings/synthetic_20251116/readings.csv
  python3 scripts/multivariate_analysis.py data/readings/synthetic_20251116/readings.csv --output data/readings/synthetic_20251116/analysis
  python3 scripts/multivariate_analysis.py data/readings/synthetic_20251116/readings.csv --analyses clustering pca
        """
    )

    parser.add_argument('csv_file', help='Diagnostic readings CSV (e.g. data/readings/synthetic_20251116/readings.csv)')
    parser.add_argument('--output', '-o', default=None,
                        help='Output directory (default: <dataset_dir>/analysis)')
    parser.add_argument('--min-coverage', type=float, default=0.0,
                        help='Drop dimension columns below this coverage fraction (0.0–1.0). '
                             'E.g. 0.8 keeps only columns ≥80%% complete. '
                             'Useful for sparse/shortform datasets (default: 0.0, keep all)')
    parser.add_argument('--analyses', nargs='+',
                        choices=['clustering', 'pca', 'tsne', 'umap', 'factor',
                                 'correlation', 'network', 'importance', 'analyst', 'all'],
                        default=['all'],
                        help='Which analyses to run (default: all)')

    args = parser.parse_args()

    csv_path = Path(args.csv_file)
    if not csv_path.exists():
        print(f"Error: File not found: {args.csv_file}")
        sys.exit(1)

    # Default the output directory to <dataset_dir>/analysis.
    output_dir = args.output if args.output else str(csv_path.parent / 'analysis')

    print("=" * 80)
    print("PROTOCOL BICORDER - MULTIVARIATE ANALYSIS")
    print("=" * 80)

    analyzer = ProtocolAnalyzer(args.csv_file, output_dir, min_coverage=args.min_coverage)

    selected = set(args.analyses)

    def wanted(name):
        # 'all' switches every analysis on.
        return 'all' in selected or name in selected

    try:
        # Clustering
        if wanted('clustering'):
            analyzer.kmeans_clustering()
            analyzer.hierarchical_clustering()
            analyzer.dbscan_clustering()

        # Dimensionality reduction
        if wanted('pca'):
            analyzer.pca_analysis()
        if wanted('tsne'):
            analyzer.tsne_analysis()
        if wanted('umap'):
            analyzer.umap_analysis()
        if wanted('factor'):
            analyzer.factor_analysis()

        # Correlation and structure
        if wanted('correlation'):
            analyzer.correlation_analysis()
        if wanted('network'):
            analyzer.network_analysis(threshold=0.6)

        # Classification
        if wanted('importance'):
            analyzer.category_discriminant_analysis()
            analyzer.feature_importance_analysis()
        if wanted('analyst'):
            analyzer.analyst_comparison()

        # Always finish with the summary report.
        analyzer.generate_summary_report()

        print("\n" + "=" * 80)
        print("ANALYSIS COMPLETE!")
        print("=" * 80)
        print(f"\nAll results saved to: {analyzer.output_dir}/")

    except Exception as e:
        print(f"\nError during analysis: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
|
||||
|
||||
|
||||
# Script entry point: run the CLI only when executed directly.
if __name__ == '__main__':
    main()
|
||||
206
analysis/scripts/review_analysis.py
Normal file
206
analysis/scripts/review_analysis.py
Normal file
@@ -0,0 +1,206 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Comprehensive review of the analysis for errors and inconsistencies.
|
||||
|
||||
Usage:
    python3 scripts/review_analysis.py data/readings/synthetic_20251116/readings.csv
    python3 scripts/review_analysis.py data/readings/manual_20260101/readings.csv --analysis-dir data/readings/manual_20260101/analysis
"""
|
||||
|
||||
import argparse
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def main():
    """Run sanity checks over a dataset CSV and its analysis outputs.

    Verifies data completeness, value ranges, clustering consistency,
    correlation-matrix validity, the average-value distribution, and
    cluster separation, then prints a summary of errors and warnings.
    """
    parser = argparse.ArgumentParser(
        description='Check analysis results for errors and inconsistencies',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
  python3 scripts/review_analysis.py data/readings/synthetic_20251116/readings.csv
  python3 scripts/review_analysis.py data/readings/manual_20260101/readings.csv --analysis-dir data/readings/manual_20260101/analysis
        """
    )
    parser.add_argument('input_csv', help='Diagnostic readings CSV (e.g. data/readings/synthetic_20251116/readings.csv)')
    parser.add_argument('--analysis-dir', default=None,
                        help='Analysis directory (default: <dataset_dir>/analysis)')
    args = parser.parse_args()

    dataset_dir = Path(args.input_csv).parent
    results_dir = Path(args.analysis_dir) if args.analysis_dir else dataset_dir / 'analysis'

    print("=" * 80)
    print("ANALYSIS REVIEW - ERROR CHECKING")
    print("=" * 80)
    print(f"Dataset: {args.input_csv}")
    print(f"Results: {results_dir}")

    # Load source data and analysis artifacts.
    df = pd.read_csv(args.input_csv)
    clusters = pd.read_csv(results_dir / 'data' / 'kmeans_clusters.csv')
    pca_coords = pd.read_csv(results_dir / 'data' / 'pca_coordinates.csv')

    # Dimension columns follow the <Category>_<name> naming convention.
    design_cols = [c for c in df.columns if c.startswith('Design_')]
    entanglement_cols = [c for c in df.columns if c.startswith('Entanglement_')]
    experience_cols = [c for c in df.columns if c.startswith('Experience_')]
    dimension_cols = design_cols + entanglement_cols + experience_cols
    # FIX: derive the dimension count instead of hardcoding 23, so shortform
    # or --min-coverage-reduced datasets validate correctly.
    n_dims = len(dimension_cols)

    errors_found = []
    warnings_found = []

    print("\n1. DATA COMPLETENESS CHECK")
    print("-" * 80)

    rows_with_missing = df[dimension_cols].isna().any(axis=1).sum()

    print(f"✓ Total protocols in source data: {len(df)}")
    print(f"✓ Protocols with complete data: {len(df) - rows_with_missing}")
    print(f"✓ Protocols with missing values: {rows_with_missing}")
    print(f"✓ Protocols in cluster analysis: {len(clusters)}")

    if rows_with_missing > 0:
        warnings_found.append(f"{rows_with_missing} protocols excluded due to missing values")
        missing_protocols = df[df[dimension_cols].isna().any(axis=1)]['Descriptor'].tolist()
        print(f"\n Excluded protocols: {', '.join(missing_protocols)}")

    merged = df.merge(clusters, on='Descriptor', how='inner')
    if len(merged) != len(clusters):
        errors_found.append(f"Descriptor mismatch: {len(merged)} matched vs {len(clusters)} expected")
    else:
        print("✓ All cluster descriptors match source data")

    print("\n2. DATA QUALITY CHECK")
    print("-" * 80)

    # FIX: the success line used to print unconditionally, even when
    # out-of-range errors had just been recorded.
    range_errors = 0
    for col in dimension_cols:
        values = df[col].dropna()
        if len(values) and (values.min() < 1 or values.max() > 9):
            errors_found.append(f"Column {col} has out-of-range values: [{values.min()}, {values.max()}]")
            range_errors += 1
    if range_errors == 0:
        print("✓ All dimension values within expected range [1, 9]")

    df_clean = df.dropna(subset=dimension_cols)
    variances = df_clean[dimension_cols].var()
    low_var_dims = variances[variances < 1.0]
    if len(low_var_dims) > 0:
        warnings_found.append(f"{len(low_var_dims)} dimensions have very low variance (< 1.0)")
        print("\n Low variance dimensions:")
        for dim, var in low_var_dims.items():
            print(f" - {dim}: {var:.3f}")
    else:
        print("✓ All dimensions have reasonable variance")

    print("\n3. CLUSTERING VALIDATION")
    print("-" * 80)

    # FIX: iterate over whatever cluster labels exist instead of hardcoding
    # labels 1 and 2 (which raised KeyError for any other k or labeling).
    cluster_sizes = clusters['cluster'].value_counts().sort_index()
    for label, size in cluster_sizes.items():
        print(f"✓ Cluster {label}: {size} protocols ({size/len(clusters)*100:.1f}%)")

    imbalance_ratio = max(cluster_sizes) / min(cluster_sizes)
    if imbalance_ratio > 2.0:
        warnings_found.append(f"Cluster imbalance ratio is {imbalance_ratio:.2f} (ideally < 2.0)")

    if len(pca_coords) != len(clusters):
        errors_found.append(f"PCA coordinates count ({len(pca_coords)}) != cluster count ({len(clusters)})")
    else:
        print("✓ PCA coordinates match cluster count")

    pca_loadings = pd.read_csv(results_dir / 'data' / 'pca_loadings.csv', index_col=0)
    if pca_loadings.shape[0] != n_dims:
        errors_found.append(f"PCA loadings have {pca_loadings.shape[0]} rows, expected {n_dims}")
    else:
        print("✓ PCA loadings have correct dimensions")

    print("\n4. STATISTICAL VALIDITY")
    print("-" * 80)

    corr_matrix = pd.read_csv(results_dir / 'data' / 'correlation_matrix.csv', index_col=0)

    # Zero the diagonal so self-correlations don't trip the check.
    np.fill_diagonal(corr_matrix.values, 0)
    perfect_corrs = np.where(np.abs(corr_matrix.values) > 0.99)
    if len(perfect_corrs[0]) > 0:
        warnings_found.append(f"Found {len(perfect_corrs[0])} near-perfect correlations between dimensions")
    else:
        print("✓ No perfect correlations found (multicollinearity check)")

    try:
        if corr_matrix.shape[0] == corr_matrix.shape[1]:
            if not np.allclose(corr_matrix.values, corr_matrix.values.T, equal_nan=True):
                errors_found.append("Correlation matrix is not symmetric")
            else:
                print("✓ Correlation matrix is symmetric")
        else:
            errors_found.append(f"Correlation matrix is not square: {corr_matrix.shape}")
    except Exception as e:
        warnings_found.append(f"Could not verify correlation matrix symmetry: {e}")

    print("\n5. AVERAGE VALUES CHECK")
    print("-" * 80)

    calculated_averages = df_clean[dimension_cols].mean(axis=1)
    print(f"✓ Average values range: [{calculated_averages.min():.2f}, {calculated_averages.max():.2f}]")
    print(f"✓ Mean of averages: {calculated_averages.mean():.2f}")
    print(f"✓ Std of averages: {calculated_averages.std():.2f}")

    # Chi-square test against a uniform distribution of per-protocol averages.
    from scipy import stats
    bins = np.arange(int(calculated_averages.min()), int(calculated_averages.max()) + 1, 0.5)
    observed_counts, _ = np.histogram(calculated_averages, bins=bins)
    expected_count = len(calculated_averages) / len(bins[:-1])
    chi2_stat = np.sum((observed_counts - expected_count)**2 / expected_count)
    p_value = 1 - stats.chi2.cdf(chi2_stat, len(bins) - 2)

    print(f"✓ Distribution uniformity test p-value: {p_value:.4f}")
    if p_value < 0.05:
        print(" (Distribution is significantly non-uniform, as expected for real data)")
    else:
        warnings_found.append("Average values may be too uniformly distributed (p > 0.05)")

    print("\n6. CLUSTER SEPARATION CHECK")
    print("-" * 80)

    merged = df_clean.merge(clusters, on='Descriptor')
    # FIX: derive the two cluster labels from the data instead of assuming
    # labels 1 and 2; skip gracefully when fewer than two clusters exist.
    labels = sorted(merged['cluster'].unique())
    if len(labels) >= 2:
        means_a = merged[merged['cluster'] == labels[0]][dimension_cols].mean()
        means_b = merged[merged['cluster'] == labels[1]][dimension_cols].mean()

        differences = (means_a - means_b).abs()
        significant_diffs = differences[differences > 0.5]
        print(f"✓ Dimensions with meaningful difference (>0.5) between clusters: {len(significant_diffs)}/{n_dims}")

        if len(significant_diffs) < 5:
            warnings_found.append(f"Only {len(significant_diffs)} dimensions show meaningful separation between clusters")

        print("\n Top 5 differentiating dimensions:")
        for dim in differences.nlargest(5).index:
            print(f" - {dim}: {differences[dim]:.3f}")
    else:
        warnings_found.append("Fewer than two clusters found; separation check skipped")

    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)

    if len(errors_found) == 0:
        print("✓ No critical errors found!")
    else:
        print(f"✗ {len(errors_found)} CRITICAL ERROR(S) FOUND:")
        for i, error in enumerate(errors_found, 1):
            print(f" {i}. {error}")

    if len(warnings_found) == 0:
        print("✓ No warnings!")
    else:
        print(f"\n⚠ {len(warnings_found)} WARNING(S):")
        for i, warning in enumerate(warnings_found, 1):
            print(f" {i}. {warning}")

    print("\n" + "=" * 80)
    print("REVIEW COMPLETE")
    print("=" * 80)
|
||||
|
||||
|
||||
# Script entry point: run the CLI only when executed directly.
if __name__ == '__main__':
    main()
|
||||
107
analysis/scripts/sync_readings.sh
Executable file
107
analysis/scripts/sync_readings.sh
Executable file
@@ -0,0 +1,107 @@
|
||||
#!/usr/bin/env bash
# Sync a readings dataset from a remote git repository, then regenerate CSV and analysis.
#
# Reads remote URL and subdirectory from a .sync_source file in the dataset directory.
#
# Usage:
#   scripts/sync_readings.sh data/readings/manual_20260320
#   scripts/sync_readings.sh data/readings/manual_20260320 --no-analysis
#   scripts/sync_readings.sh data/readings/manual_20260320 --min-coverage 0.8
#   scripts/sync_readings.sh data/readings/manual_20260320 --training data/readings/synthetic_20251116/readings.csv
#
# .sync_source format:
#   REMOTE_URL=https://git.example.org/user/repo
#   REMOTE_SUBDIR=readings

set -euo pipefail

# FIX: usage string now documents --training as well.
DATASET_DIR="${1:?Usage: $0 <dataset_dir> [--no-analysis] [--min-coverage N] [--training CSV]}"
RUN_ANALYSIS=true
MIN_COVERAGE=0.8
TRAINING_CSV="data/readings/synthetic_20251116/readings.csv"

shift || true
while [[ $# -gt 0 ]]; do
    case "$1" in
        --no-analysis) RUN_ANALYSIS=false ;;
        --min-coverage) MIN_COVERAGE="${2:?--min-coverage requires a value}"; shift ;;
        --training) TRAINING_CSV="${2:?--training requires a value}"; shift ;;
        *) echo "Unknown option: $1" >&2; exit 1 ;;
    esac
    shift
done

SYNC_SOURCE="$DATASET_DIR/.sync_source"
if [[ ! -f "$SYNC_SOURCE" ]]; then
    echo "Error: $SYNC_SOURCE not found. Create it with REMOTE_URL and REMOTE_SUBDIR." >&2
    exit 1
fi

# Load config.
# FIX: `|| true` — under `set -e -o pipefail` a missing key made grep abort
# the script before the friendly error message below could run.
REMOTE_URL=$(grep '^REMOTE_URL=' "$SYNC_SOURCE" | cut -d= -f2- || true)
REMOTE_SUBDIR=$(grep '^REMOTE_SUBDIR=' "$SYNC_SOURCE" | cut -d= -f2- || true)

if [[ -z "$REMOTE_URL" ]]; then
    echo "Error: REMOTE_URL not set in $SYNC_SOURCE" >&2
    exit 1
fi

REMOTE_SUBDIR="${REMOTE_SUBDIR:-readings}"
JSON_DIR="$DATASET_DIR/json"

echo "========================================"
echo "Syncing: $DATASET_DIR"
echo "From: $REMOTE_URL/$REMOTE_SUBDIR"
echo "========================================"

# Clone remote to temp dir and copy JSON files.
# FIX: renamed TMPDIR -> WORKDIR; TMPDIR is a standard environment variable
# consulted by mktemp and child processes, so clobbering it is unsafe.
WORKDIR=$(mktemp -d)
trap 'rm -rf "$WORKDIR"' EXIT

echo ""
echo "Fetching remote data..."
git clone --depth 1 --quiet "$REMOTE_URL" "$WORKDIR"

SRC="$WORKDIR/$REMOTE_SUBDIR"
if [[ ! -d "$SRC" ]]; then
    echo "Error: subdirectory '$REMOTE_SUBDIR' not found in remote repo." >&2
    exit 1
fi

# FIX: the old recursive `find` count could disagree with the non-recursive
# `cp "$SRC"/*.json`; count and copy now both cover exactly the top level,
# and an empty glob is reported instead of crashing cp under `set -e`.
mkdir -p "$JSON_DIR"
NEW=0
for f in "$SRC"/*.json; do
    [[ -e "$f" ]] || continue    # glob matched nothing
    cp "$f" "$JSON_DIR"/
    NEW=$((NEW + 1))
done
if [[ "$NEW" -eq 0 ]]; then
    echo "Error: no JSON files found in $SRC" >&2
    exit 1
fi
echo "Copied $NEW JSON files → $JSON_DIR"

# Prefer the project venv's interpreter when present.
PYTHON=python3
if [[ -f ".venv/bin/python3" ]]; then
    PYTHON=".venv/bin/python3"
fi

# Regenerate CSV
echo ""
echo "Regenerating readings.csv..."
"$PYTHON" scripts/json_to_csv.py "$JSON_DIR" -o "$DATASET_DIR/readings.csv"

if [[ "$RUN_ANALYSIS" == true ]]; then
    echo ""
    echo "Running multivariate analysis (--min-coverage $MIN_COVERAGE)..."
    "$PYTHON" scripts/multivariate_analysis.py \
        "$DATASET_DIR/readings.csv" \
        --min-coverage "$MIN_COVERAGE" \
        --analyses clustering pca correlation importance

    echo ""
    echo "Generating LDA visualization..."
    "$PYTHON" scripts/lda_visualization.py "$DATASET_DIR/readings.csv"

    echo ""
    echo "Classifying readings (training: $TRAINING_CSV)..."
    "$PYTHON" scripts/classify_readings.py \
        "$DATASET_DIR/readings.csv" \
        --training "$TRAINING_CSV"
fi

echo ""
echo "Done. Dataset: $DATASET_DIR"
|
||||
167
analysis/scripts/visualize_clusters.py
Normal file
167
analysis/scripts/visualize_clusters.py
Normal file
@@ -0,0 +1,167 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Create visualizations of k-means clusters overlaid on dimensionality reduction plots.
|
||||
|
||||
Usage:
|
||||
python3 scripts/visualize_clusters.py data/readings/synthetic_20251116.csv
|
||||
python3 scripts/visualize_clusters.py data/readings/manual_20260101.csv --results-dir analysis_results/manual_20260101
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Cluster color scheme and human-readable labels (cluster ids are 1-indexed
# in kmeans_clusters.csv).
_CLUSTER_COLORS = {1: '#2E86AB', 2: '#A23B72'}  # blue / purple
_CLUSTER_NAMES = {
    1: 'Cluster 1: Relational/Cultural',
    2: 'Cluster 2: Institutional/Bureaucratic',
}


def _plot_clustered(data, xcol, ycol, xlabel, ylabel, title, out_path):
    """Render one cluster-colored scatter plot and save it to *out_path*.

    Parameters:
        data: DataFrame with columns *xcol*, *ycol*, 'cluster', 'Descriptor'.
        xcol, ycol: names of the 2-D embedding coordinate columns.
        xlabel, ylabel, title: axis labels and figure title.
        out_path: destination PNG path (parent directory must exist).
    """
    fig, ax = plt.subplots(figsize=(14, 12))

    # One scatter call per cluster so each gets its own color/legend entry.
    for cluster_id in (1, 2):
        pts = data[data['cluster'] == cluster_id]
        ax.scatter(pts[xcol], pts[ycol],
                   c=_CLUSTER_COLORS[cluster_id],
                   label=_CLUSTER_NAMES[cluster_id],
                   alpha=0.6, s=60, edgecolors='white', linewidth=0.5)

    # Annotate only every 8th row (by merged-frame index) to keep the
    # plot legible on datasets with many protocols.
    for cluster_id in (1, 2):
        pts = data[data['cluster'] == cluster_id]
        for i, row in pts.iterrows():
            if i % 8 == 0:
                ax.annotate(row['Descriptor'],
                            (row[xcol], row[ycol]),
                            fontsize=7, alpha=0.7,
                            xytext=(5, 5), textcoords='offset points')

    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.legend(loc='best', fontsize=10, framealpha=0.9)
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(out_path, dpi=300, bbox_inches='tight')
    print(f"  Saved: {out_path}")
    plt.close()


def _print_summary(clusters):
    """Print cluster sizes and up to ten sample descriptors per cluster."""
    print("\n=== Cluster Summary ===")
    print(f"Total protocols: {len(clusters)}")
    print(f"\nCluster 1 (Relational/Cultural): {len(clusters[clusters['cluster'] == 1])} protocols")
    print(f"Cluster 2 (Institutional/Bureaucratic): {len(clusters[clusters['cluster'] == 2])} protocols")

    print("\nSample protocols from each cluster:")
    print("\nCluster 1 (Relational/Cultural):")
    for protocol in clusters[clusters['cluster'] == 1]['Descriptor'].head(10):
        print(f"  - {protocol}")

    print("\nCluster 2 (Institutional/Bureaucratic):")
    for protocol in clusters[clusters['cluster'] == 2]['Descriptor'].head(10):
        print(f"  - {protocol}")

    print("\n=== Visualization Complete! ===")


def main():
    """CLI entry point.

    Loads k-means cluster assignments plus the saved PCA / t-SNE (and, when
    present, UMAP) coordinates from <analysis-dir>/data, writes one
    cluster-colored scatter PNG per embedding into <analysis-dir>/plots,
    and prints a textual cluster summary.
    """
    parser = argparse.ArgumentParser(
        description='Visualize k-means clusters in PCA/t-SNE/UMAP space',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
  python3 scripts/visualize_clusters.py data/readings/synthetic_20251116/readings.csv
  python3 scripts/visualize_clusters.py data/readings/manual_20260101/readings.csv --analysis-dir data/readings/manual_20260101/analysis
"""
    )
    parser.add_argument('input_csv', help='Diagnostic readings CSV (e.g. data/readings/synthetic_20251116/readings.csv)')
    parser.add_argument('--analysis-dir', default=None,
                        help='Analysis directory (default: <dataset_dir>/analysis)')
    args = parser.parse_args()

    # Analysis artifacts live next to the dataset unless overridden.
    dataset_dir = Path(args.input_csv).parent
    results_dir = Path(args.analysis_dir) if args.analysis_dir else dataset_dir / 'analysis'
    plots_dir = results_dir / 'plots'
    data_dir = results_dir / 'data'

    # Cluster assignments are already 1-indexed in kmeans_clusters.csv.
    clusters = pd.read_csv(data_dir / 'kmeans_clusters.csv')

    # ========== PCA ==========
    print("Creating PCA plot with cluster colors...")
    pca_data = pd.read_csv(data_dir / 'pca_coordinates.csv').merge(clusters, on='Descriptor')
    # NOTE(review): variance percentages below are hard-coded from one run
    # (and PC2 > PC1 looks off, since PCA orders components by variance) —
    # confirm against the analysis output, or read them from a saved file.
    _plot_clustered(pca_data, 'PC1', 'PC2',
                    'PC1 (22.5% variance)', 'PC2 (22.7% variance)',
                    'K-Means Clusters in PCA Space\nTwo Distinct Protocol Families',
                    plots_dir / 'pca_2d_clustered.png')

    # ========== t-SNE ==========
    print("Creating t-SNE plot with cluster colors...")
    tsne_data = pd.read_csv(data_dir / 'tsne_coordinates.csv').merge(clusters, on='Descriptor')
    _plot_clustered(tsne_data, 'TSNE1', 'TSNE2',
                    't-SNE Dimension 1', 't-SNE Dimension 2',
                    'K-Means Clusters in t-SNE Space\nTwo Distinct Protocol Families',
                    plots_dir / 'tsne_2d_clustered.png')

    # ========== UMAP (optional — only if coordinates were generated) ==========
    umap_path = data_dir / 'umap_coordinates.csv'
    if umap_path.exists():
        print("Creating UMAP plot with cluster colors...")
        umap_data = pd.read_csv(umap_path).merge(clusters, on='Descriptor')
        _plot_clustered(umap_data, 'UMAP1', 'UMAP2',
                        'UMAP Dimension 1', 'UMAP Dimension 2',
                        'K-Means Clusters in UMAP Space\nTwo Distinct Protocol Families',
                        plots_dir / 'umap_2d_clustered.png')

    _print_summary(clusters)
||||
|
||||
|
||||
# Run the CLI only when executed as a script (not when imported).
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user