- Move all scripts to scripts/, web assets to web/, analysis results into self-contained data/readings/<type>_<YYYYMMDD>/ directories - Add data/readings/manual_20260320/ with 32 JSON readings from git.medlab.host/ntnsndr/protocol-bicorder-data - Add scripts/json_to_csv.py to convert bicorder JSON files to CSV - Add scripts/sync_readings.sh for one-command sync + re-analysis of any dataset backed by a .sync_source config file - Add scripts/classify_readings.py to apply the LDA classifier to all readings and save per-reading cluster assignments - Add --min-coverage flag to multivariate_analysis.py for sparse/shortform datasets; also applies in lda_visualization.py - Fix lda_visualization.py NaN handling and 0-d array annotation bug - Update README.md and WORKFLOW.md to document datasets, sync workflow, shortform handling, and new scripts Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
108 lines
3.1 KiB
Bash
Executable File
108 lines
3.1 KiB
Bash
Executable File
#!/usr/bin/env bash
#
# Sync a readings dataset from a remote git repository, then regenerate
# its CSV and analysis outputs.
#
# The dataset directory must contain a .sync_source file naming the remote
# repository and the subdirectory that holds the JSON readings:
#
#   REMOTE_URL=https://git.example.org/user/repo
#   REMOTE_SUBDIR=readings
#
# Usage:
#   scripts/sync_readings.sh data/readings/manual_20260320
#   scripts/sync_readings.sh data/readings/manual_20260320 --no-analysis
#   scripts/sync_readings.sh data/readings/manual_20260320 --min-coverage 0.8
#   scripts/sync_readings.sh data/readings/manual_20260320 --training data/readings/synthetic_20251116/readings.csv
#
# NOTE(review): relative paths (scripts/, .venv/) assume this is run from
# the repository root — confirm before relocating the script.

set -euo pipefail

# Required first positional argument: the dataset directory to sync.
DATASET_DIR="${1:?Usage: $0 <dataset_dir> [--no-analysis] [--min-coverage N]}"

# Flag defaults; each may be overridden on the command line.
RUN_ANALYSIS=true                                             # --no-analysis
MIN_COVERAGE=0.8                                              # --min-coverage
TRAINING_CSV="data/readings/synthetic_20251116/readings.csv"  # --training

shift || true  # drop the dataset argument; anything left is flags
# Parse optional flags into the globals RUN_ANALYSIS, MIN_COVERAGE and
# TRAINING_CSV. Exits with status 1 on an unknown option.
# ${2:?...} gives a readable error when a flag's value is missing, instead
# of the cryptic "unbound variable" that `set -u` would raise on a bare $2.
parse_flags() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --no-analysis) RUN_ANALYSIS=false ;;
      --min-coverage) MIN_COVERAGE="${2:?--min-coverage requires a value}"; shift ;;
      --training) TRAINING_CSV="${2:?--training requires a value}"; shift ;;
      *) echo "Unknown option: $1" >&2; exit 1 ;;
    esac
    shift
  done
}
parse_flags "$@"
# Locate and load the dataset's sync configuration.
SYNC_SOURCE="$DATASET_DIR/.sync_source"
if [[ ! -f "$SYNC_SOURCE" ]]; then
  echo "Error: $SYNC_SOURCE not found. Create it with REMOTE_URL and REMOTE_SUBDIR." >&2
  exit 1
fi

# Load config.
# The `|| true` guards are essential: grep exits non-zero when a key is
# absent, and under `set -e` + `pipefail` that would kill the script
# silently — before the REMOTE_URL error below can be printed, and before
# the REMOTE_SUBDIR default can be applied.
REMOTE_URL=$(grep '^REMOTE_URL=' "$SYNC_SOURCE" | cut -d= -f2-) || true
REMOTE_SUBDIR=$(grep '^REMOTE_SUBDIR=' "$SYNC_SOURCE" | cut -d= -f2-) || true

if [[ -z "$REMOTE_URL" ]]; then
  echo "Error: REMOTE_URL not set in $SYNC_SOURCE" >&2
  exit 1
fi

# REMOTE_SUBDIR is optional; fall back to the conventional layout.
REMOTE_SUBDIR="${REMOTE_SUBDIR:-readings}"
JSON_DIR="$DATASET_DIR/json"
echo "========================================"
echo "Syncing: $DATASET_DIR"
echo "From: $REMOTE_URL/$REMOTE_SUBDIR"
echo "========================================"

# Clone the remote into a scratch directory and copy the JSON files over.
# Use a dedicated variable name: overwriting the special TMPDIR environment
# variable would redirect mktemp and other tools in every child process.
CLONE_DIR=$(mktemp -d)
# Single quotes defer expansion to trap-fire time and survive odd characters.
trap 'rm -rf -- "$CLONE_DIR"' EXIT

echo ""
echo "Fetching remote data..."
git clone --depth 1 --quiet "$REMOTE_URL" "$CLONE_DIR"

SRC="$CLONE_DIR/$REMOTE_SUBDIR"
if [[ ! -d "$SRC" ]]; then
  echo "Error: subdirectory '$REMOTE_SUBDIR' not found in remote repo." >&2
  exit 1
fi

# Count only top-level *.json — exactly the set the cp below copies
# (a recursive count would over-report relative to the copy).
NEW=$(find "$SRC" -maxdepth 1 -name '*.json' | wc -l | tr -d ' ')
if [[ "$NEW" -eq 0 ]]; then
  # Guard explicitly: otherwise cp fails on the unexpanded glob with a
  # confusing "No such file or directory" message.
  echo "Error: no JSON files found in $SRC" >&2
  exit 1
fi
mkdir -p "$JSON_DIR"
cp -- "$SRC"/*.json "$JSON_DIR"/
echo "Copied $NEW JSON files → $JSON_DIR"
# Pick the interpreter: prefer the repo-local virtualenv when present,
# otherwise fall back to whatever python3 is on PATH.
if [[ -f ".venv/bin/python3" ]]; then
  PYTHON=".venv/bin/python3"
else
  PYTHON=python3
fi

# Rebuild the dataset's CSV from the freshly synced JSON files.
echo ""
echo "Regenerating readings.csv..."
"$PYTHON" scripts/json_to_csv.py "$JSON_DIR" -o "$DATASET_DIR/readings.csv"
# Print a blank line followed by a status message (keeps the original
# output format: each phase is preceded by an empty line).
step() { echo ""; echo "$1"; }

if [[ "$RUN_ANALYSIS" == true ]]; then
  step "Running multivariate analysis (--min-coverage $MIN_COVERAGE)..."
  "$PYTHON" scripts/multivariate_analysis.py \
    "$DATASET_DIR/readings.csv" \
    --min-coverage "$MIN_COVERAGE" \
    --analyses clustering pca correlation importance

  step "Generating LDA visualization..."
  "$PYTHON" scripts/lda_visualization.py "$DATASET_DIR/readings.csv"

  step "Classifying readings (training: $TRAINING_CSV)..."
  "$PYTHON" scripts/classify_readings.py \
    "$DATASET_DIR/readings.csv" \
    --training "$TRAINING_CSV"
fi

step "Done. Dataset: $DATASET_DIR"