Files
protocol-bicorder/analysis/scripts/sync_readings.sh
Nathan Schneider 60e83783ec Flatten data/readings/ → data/
Remove the intermediate readings/ subdirectory level — dataset naming
(synthetic_YYYYMMDD, manual_YYYYMMDD) already encodes what the data is.
Update all path references across scripts and docs accordingly.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-20 17:46:23 -06:00

108 lines
3.0 KiB
Bash
Executable File

#!/usr/bin/env bash
# Sync a readings dataset from a remote git repository, then regenerate CSV and analysis.
#
# Reads remote URL and subdirectory from a .sync_source file in the dataset directory.
#
# Usage:
# scripts/sync_readings.sh data/manual_20260320
# scripts/sync_readings.sh data/manual_20260320 --no-analysis
# scripts/sync_readings.sh data/manual_20260320 --min-coverage 0.8
# scripts/sync_readings.sh data/manual_20260320 --training data/synthetic_20251116/readings.csv
#
# .sync_source format:
# REMOTE_URL=https://git.example.org/user/repo
# REMOTE_SUBDIR=readings
set -euo pipefail

# First positional argument is required; abort with usage text otherwise.
DATASET_DIR="${1:?Usage: $0 <dataset_dir> [--no-analysis] [--min-coverage N] [--training CSV]}"

# Defaults, overridable by the flags parsed below.
RUN_ANALYSIS=true
MIN_COVERAGE=0.8
TRAINING_CSV="data/synthetic_20251116/readings.csv"

shift || true
while [[ $# -gt 0 ]]; do
  case "$1" in
    --no-analysis) RUN_ANALYSIS=false ;;
    # ${2:?...} yields a clear diagnostic when the flag's value is missing,
    # instead of set -u's bare "unbound variable" abort on "$2".
    --min-coverage) MIN_COVERAGE="${2:?--min-coverage requires a value}"; shift ;;
    --training) TRAINING_CSV="${2:?--training requires a value}"; shift ;;
    *) echo "Unknown option: $1" >&2; exit 1 ;;
  esac
  shift
done
SYNC_SOURCE="$DATASET_DIR/.sync_source"
if [[ ! -f "$SYNC_SOURCE" ]]; then
  echo "Error: $SYNC_SOURCE not found. Create it with REMOTE_URL and REMOTE_SUBDIR." >&2
  exit 1
fi

# Load config.
# grep exits non-zero when a key is absent, and that status propagates through
# a plain VAR=$(...) assignment — under `set -e -o pipefail` the script would
# die here with no message, making the REMOTE_URL error below and the
# REMOTE_SUBDIR default unreachable. Mask the status and validate explicitly.
REMOTE_URL=$(grep '^REMOTE_URL=' "$SYNC_SOURCE" | cut -d= -f2- || true)
REMOTE_SUBDIR=$(grep '^REMOTE_SUBDIR=' "$SYNC_SOURCE" | cut -d= -f2- || true)
if [[ -z "$REMOTE_URL" ]]; then
  echo "Error: REMOTE_URL not set in $SYNC_SOURCE" >&2
  exit 1
fi
# Subdirectory is optional; default matches the documented layout.
REMOTE_SUBDIR="${REMOTE_SUBDIR:-readings}"
JSON_DIR="$DATASET_DIR/json"
echo "========================================"
echo "Syncing: $DATASET_DIR"
echo "From: $REMOTE_URL/$REMOTE_SUBDIR"
echo "========================================"

# Clone remote to a scratch dir and copy JSON files.
# Use a private variable name: TMPDIR is a reserved environment variable that
# mktemp itself and child processes (git, python) consult for their temp files,
# so exporting/overwriting it can redirect their scratch space unexpectedly.
WORK_DIR=$(mktemp -d)
# Single-quoted trap body: "$WORK_DIR" is expanded when the trap fires, which
# avoids the quoting hazard of baking the path into the trap string up front.
trap 'rm -rf -- "$WORK_DIR"' EXIT

echo ""
echo "Fetching remote data..."
git clone --depth 1 --quiet "$REMOTE_URL" "$WORK_DIR"

SRC="$WORK_DIR/$REMOTE_SUBDIR"
if [[ ! -d "$SRC" ]]; then
  echo "Error: subdirectory '$REMOTE_SUBDIR' not found in remote repo." >&2
  exit 1
fi

# NOTE(review): the count below is recursive but the cp glob copies only the
# top level of $SRC — confirm the remote layout is flat.
NEW=$(find "$SRC" -name '*.json' | wc -l | tr -d ' ')
if [[ "$NEW" -eq 0 ]]; then
  # Without this guard an unmatched *.json glob would make cp fail with a
  # cryptic "No such file or directory" under set -e.
  echo "Error: no JSON files found in $SRC" >&2
  exit 1
fi
mkdir -p "$JSON_DIR"
cp -- "$SRC"/*.json "$JSON_DIR"/
echo "Copied $NEW JSON files → $JSON_DIR"
# Pick the Python interpreter: prefer the project virtualenv's python3 when
# one is present, otherwise fall back to whatever python3 is on PATH.
if [[ -f ".venv/bin/python3" ]]; then
  PYTHON=".venv/bin/python3"
else
  PYTHON=python3
fi

# Rebuild the dataset CSV from the freshly synced JSON files.
echo ""
echo "Regenerating readings.csv..."
"$PYTHON" scripts/json_to_csv.py "$JSON_DIR" -o "$DATASET_DIR/readings.csv"

# Optional analysis stage, skipped when --no-analysis was given.
if [[ "$RUN_ANALYSIS" == true ]]; then
  echo ""
  echo "Running multivariate analysis (--min-coverage $MIN_COVERAGE)..."
  "$PYTHON" scripts/multivariate_analysis.py \
    "$DATASET_DIR/readings.csv" \
    --min-coverage "$MIN_COVERAGE" \
    --analyses clustering pca correlation importance

  echo ""
  echo "Generating LDA visualization..."
  "$PYTHON" scripts/lda_visualization.py "$DATASET_DIR/readings.csv"

  echo ""
  echo "Classifying readings (training: $TRAINING_CSV)..."
  "$PYTHON" scripts/classify_readings.py \
    "$DATASET_DIR/readings.csv" \
    --training "$TRAINING_CSV"
fi

echo ""
echo "Done. Dataset: $DATASET_DIR"