Files
protocol-bicorder/analysis/scripts/sync_readings.sh
Nathan Schneider 897c30406b Reorganize directory, add manual dataset and sync tooling
- Move all scripts to scripts/, web assets to web/, analysis results
  into self-contained data/readings/<type>_<YYYYMMDD>/ directories
- Add data/readings/manual_20260320/ with 32 JSON readings from
  git.medlab.host/ntnsndr/protocol-bicorder-data
- Add scripts/json_to_csv.py to convert bicorder JSON files to CSV
- Add scripts/sync_readings.sh for one-command sync + re-analysis of
  any dataset backed by a .sync_source config file
- Add scripts/classify_readings.py to apply the LDA classifier to all
  readings and save per-reading cluster assignments
- Add --min-coverage flag to multivariate_analysis.py for sparse/shortform
  datasets; also applies in lda_visualization.py
- Fix lda_visualization.py NaN handling and 0-d array annotation bug
- Update README.md and WORKFLOW.md to document datasets, sync workflow,
  shortform handling, and new scripts

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-20 17:35:13 -06:00

108 lines
3.1 KiB
Bash
Executable File

#!/usr/bin/env bash
# Sync a readings dataset from a remote git repository, then regenerate CSV and analysis.
#
# Reads remote URL and subdirectory from a .sync_source file in the dataset directory.
#
# Usage:
# scripts/sync_readings.sh data/readings/manual_20260320
# scripts/sync_readings.sh data/readings/manual_20260320 --no-analysis
# scripts/sync_readings.sh data/readings/manual_20260320 --min-coverage 0.8
# scripts/sync_readings.sh data/readings/manual_20260320 --training data/readings/synthetic_20251116/readings.csv
#
# .sync_source format:
# REMOTE_URL=https://git.example.org/user/repo
# REMOTE_SUBDIR=readings
set -euo pipefail

# Required first argument: the dataset directory to sync.
# The usage string lists every flag the option loop below accepts
# (the original omitted --training).
DATASET_DIR="${1:?Usage: $0 <dataset_dir> [--no-analysis] [--min-coverage N] [--training CSV]}"

# Defaults; each may be overridden by a command-line flag.
RUN_ANALYSIS=true                                             # --no-analysis sets false
MIN_COVERAGE=0.8                                              # forwarded to multivariate_analysis.py
TRAINING_CSV="data/readings/synthetic_20251116/readings.csv"  # LDA classifier training data

# Drop <dataset_dir> so only option flags remain in "$@".
# No `|| true` needed: ${1:?} above guarantees at least one argument exists.
shift
#######################################
# Parse optional command-line flags.
# Globals written: RUN_ANALYSIS, MIN_COVERAGE, TRAINING_CSV
# Arguments: the remaining options after <dataset_dir>
# Exits: 1 on an unknown option or a value flag missing its value
#######################################
parse_options() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --no-analysis) RUN_ANALYSIS=false ;;
      # ${2:?...} prints a clear diagnostic when a value-taking flag is
      # given without its value; under `set -u` the bare "$2" would die
      # with an unhelpful "unbound variable" error instead.
      --min-coverage) MIN_COVERAGE="${2:?--min-coverage requires a value}"; shift ;;
      --training) TRAINING_CSV="${2:?--training requires a CSV path}"; shift ;;
      *) echo "Unknown option: $1" >&2; exit 1 ;;
    esac
    shift
  done
}
parse_options "$@"
SYNC_SOURCE="$DATASET_DIR/.sync_source"
if [[ ! -f "$SYNC_SOURCE" ]]; then
  echo "Error: $SYNC_SOURCE not found. Create it with REMOTE_URL and REMOTE_SUBDIR." >&2
  exit 1
fi
# Load config. The `|| true` guards are required: under `set -e -o pipefail`,
# a grep with no match returns 1 and would kill the script *before* the
# friendly "REMOTE_URL not set" diagnostic below ever runs.
REMOTE_URL=$(grep '^REMOTE_URL=' "$SYNC_SOURCE" | cut -d= -f2- || true)
REMOTE_SUBDIR=$(grep '^REMOTE_SUBDIR=' "$SYNC_SOURCE" | cut -d= -f2- || true)
if [[ -z "$REMOTE_URL" ]]; then
  echo "Error: REMOTE_URL not set in $SYNC_SOURCE" >&2
  exit 1
fi
# REMOTE_SUBDIR is optional; default to the conventional "readings" directory.
REMOTE_SUBDIR="${REMOTE_SUBDIR:-readings}"
JSON_DIR="$DATASET_DIR/json"
echo "========================================"
echo "Syncing: $DATASET_DIR"
echo "From: $REMOTE_URL/$REMOTE_SUBDIR"
echo "========================================"
# Clone remote to a temp dir and copy JSON files.
# NOTE: use a lowercase name — assigning to TMPDIR would shadow the
# environment variable that mktemp and child processes (e.g. Python's
# tempfile) consult, pointing their temp space at a directory we delete
# on exit.
tmp_clone=$(mktemp -d)
trap 'rm -rf -- "$tmp_clone"' EXIT
echo ""
echo "Fetching remote data..."
git clone --depth 1 --quiet "$REMOTE_URL" "$tmp_clone"
SRC="$tmp_clone/$REMOTE_SUBDIR"
if [[ ! -d "$SRC" ]]; then
  echo "Error: subdirectory '$REMOTE_SUBDIR' not found in remote repo." >&2
  exit 1
fi
# Count only top-level *.json — the same set the cp below copies.
# (A recursive find could report more files than were actually copied.)
NEW=$(find "$SRC" -maxdepth 1 -name '*.json' | wc -l | tr -d ' ')
if [[ "$NEW" -eq 0 ]]; then
  # Fail with a clear message instead of letting cp die on an unmatched glob.
  echo "Error: no JSON files found in $SRC" >&2
  exit 1
fi
mkdir -p "$JSON_DIR"
cp -- "$SRC"/*.json "$JSON_DIR"/
echo "Copied $NEW JSON files → $JSON_DIR"
# Prefer the project virtualenv's interpreter when present.
PYTHON=python3
if [[ -f ".venv/bin/python3" ]]; then
  PYTHON=".venv/bin/python3"
fi
# Regenerate CSV
echo ""
echo "Regenerating readings.csv..."
"$PYTHON" scripts/json_to_csv.py "$JSON_DIR" -o "$DATASET_DIR/readings.csv"
# Optionally run the downstream analysis pipeline over the fresh CSV.
if [[ "$RUN_ANALYSIS" == true ]]; then
  readings_csv="$DATASET_DIR/readings.csv"

  printf '\n'
  echo "Running multivariate analysis (--min-coverage $MIN_COVERAGE)..."
  "$PYTHON" scripts/multivariate_analysis.py \
    "$readings_csv" \
    --min-coverage "$MIN_COVERAGE" \
    --analyses clustering pca correlation importance

  printf '\n'
  echo "Generating LDA visualization..."
  "$PYTHON" scripts/lda_visualization.py "$readings_csv"

  printf '\n'
  echo "Classifying readings (training: $TRAINING_CSV)..."
  "$PYTHON" scripts/classify_readings.py \
    "$readings_csv" \
    --training "$TRAINING_CSV"
fi

printf '\n'
echo "Done. Dataset: $DATASET_DIR"