#!/usr/bin/env bash
# Sync a readings dataset from a remote git repository, then regenerate the
# CSV and (optionally) run the analysis scripts on it.
#
# The remote URL and subdirectory are read from a .sync_source file in the
# dataset directory.
#
# Usage:
#   scripts/sync_readings.sh data/manual_20260320
#   scripts/sync_readings.sh data/manual_20260320 --no-analysis
#   scripts/sync_readings.sh data/manual_20260320 --min-coverage 0.8
#   scripts/sync_readings.sh data/manual_20260320 --training data/synthetic_20251116/readings.csv
#
# .sync_source format:
#   REMOTE_URL=https://git.example.org/user/repo
#   REMOTE_SUBDIR=readings        (optional; defaults to "readings")
#
# The helper scripts (scripts/*.py) and the optional .venv are referenced by
# relative path, so they are resolved against the current working directory.

set -euo pipefail

# Directory holding the temporary clone. Deliberately NOT named TMPDIR: that
# name is the standard temp-dir environment variable, and overwriting it would
# redirect temp files of child processes (git, python) into the clone.
CLONE_DIR=""

# Print a message to stderr and exit with status 1.
die() {
  echo "$*" >&2
  exit 1
}

# Remove the temporary clone, if one was created. Installed as the EXIT trap.
cleanup() {
  if [[ -n "$CLONE_DIR" ]]; then
    rm -rf -- "$CLONE_DIR"
  fi
}

# Print the value of the first "KEY=value" line in a config file.
# Arguments: $1 - key name, $2 - path to the config file
# Outputs:   the value on stdout, or nothing when the key is absent
# Unlike `grep | cut`, awk exits 0 when nothing matches, so a missing key does
# not abort the script under `set -e -o pipefail`. A trailing carriage return
# (CRLF file) is stripped from the value.
read_config_value() {
  local key=$1 file=$2
  awk -v key="$key" '
    index($0, key "=") == 1 {
      value = substr($0, length(key) + 2)
      sub(/\r$/, "", value)
      print value
      exit
    }
  ' "$file"
}

# Entry point: parse arguments, fetch the remote JSON files, rebuild the CSV
# and run the analysis scripts unless --no-analysis was given.
main() {
  local dataset_dir="${1:?Usage: $0 <dataset-dir> [--no-analysis] [--min-coverage N] [--training CSV]}"
  local run_analysis=true
  local min_coverage=0.8
  local training_csv="data/synthetic_20251116/readings.csv"
  shift

  while [[ $# -gt 0 ]]; do
    case "$1" in
      --no-analysis)
        run_analysis=false
        ;;
      --min-coverage)
        [[ $# -ge 2 ]] || die "Option $1 requires an argument"
        min_coverage="$2"
        shift
        ;;
      --training)
        [[ $# -ge 2 ]] || die "Option $1 requires an argument"
        training_csv="$2"
        shift
        ;;
      *)
        die "Unknown option: $1"
        ;;
    esac
    shift
  done

  local sync_source="$dataset_dir/.sync_source"
  if [[ ! -f "$sync_source" ]]; then
    die "Error: $sync_source not found. Create it with REMOTE_URL and REMOTE_SUBDIR."
  fi

  # Load config
  local remote_url remote_subdir
  remote_url=$(read_config_value REMOTE_URL "$sync_source")
  remote_subdir=$(read_config_value REMOTE_SUBDIR "$sync_source")

  if [[ -z "$remote_url" ]]; then
    die "Error: REMOTE_URL not set in $sync_source"
  fi
  remote_subdir="${remote_subdir:-readings}"

  local json_dir="$dataset_dir/json"

  echo "========================================"
  echo "Syncing: $dataset_dir"
  echo "From: $remote_url/$remote_subdir"
  echo "========================================"

  # Clone remote to a temp dir; the trap removes it on every exit path.
  CLONE_DIR=$(mktemp -d)
  trap cleanup EXIT

  echo ""
  echo "Fetching remote data..."
  git clone --depth 1 --quiet -- "$remote_url" "$CLONE_DIR"

  local src="$CLONE_DIR/$remote_subdir"
  if [[ ! -d "$src" ]]; then
    die "Error: subdirectory '$remote_subdir' not found in remote repo."
  fi

  # Collect exactly the files that will be copied (top level only), so the
  # reported count matches and an empty match is detected up front instead of
  # handing cp a literal '*.json'.
  local -a json_files
  shopt -s nullglob
  json_files=("$src"/*.json)
  shopt -u nullglob
  if (( ${#json_files[@]} == 0 )); then
    die "Error: no JSON files found in '$remote_subdir' of remote repo."
  fi

  mkdir -p -- "$json_dir"
  cp -- "${json_files[@]}" "$json_dir"/
  echo "Copied ${#json_files[@]} JSON files → $json_dir"

  # Prefer the project virtualenv interpreter when present.
  local python=python3
  if [[ -f ".venv/bin/python3" ]]; then
    python=".venv/bin/python3"
  fi

  # Regenerate CSV
  echo ""
  echo "Regenerating readings.csv..."
  "$python" scripts/json_to_csv.py "$json_dir" -o "$dataset_dir/readings.csv"

  if [[ "$run_analysis" == true ]]; then
    echo ""
    echo "Running multivariate analysis (--min-coverage $min_coverage)..."
    "$python" scripts/multivariate_analysis.py \
      "$dataset_dir/readings.csv" \
      --min-coverage "$min_coverage" \
      --analyses clustering pca correlation importance

    echo ""
    echo "Generating LDA visualization..."
    "$python" scripts/lda_visualization.py "$dataset_dir/readings.csv"

    echo ""
    echo "Classifying readings (training: $training_csv)..."
    "$python" scripts/classify_readings.py \
      "$dataset_dir/readings.csv" \
      --training "$training_csv"
  fi

  echo ""
  echo "Done. Dataset: $dataset_dir"
}

main "$@"