diff --git a/scripts/validate-parquet b/scripts/validate-parquet new file mode 100755 index 000000000..817cb01ee --- /dev/null +++ b/scripts/validate-parquet @@ -0,0 +1,727 @@ +#!/usr/bin/env bash + +set -euo pipefail + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Default paths +CORE_DB_DEFAULT="data/sqlite/core.db" +BUNDLES_DB_DEFAULT="data/sqlite/bundles.db" +PARQUET_DIR_1_DEFAULT="data/parquet-ts" +PARQUET_DIR_2_DEFAULT="data/parquet-bash" +OUTPUT_DIR_DEFAULT="data/validation-results" + +# Script variables +CORE_DB="$CORE_DB_DEFAULT" +BUNDLES_DB="$BUNDLES_DB_DEFAULT" +PARQUET_DIR_1="$PARQUET_DIR_1_DEFAULT" +PARQUET_DIR_2="$PARQUET_DIR_2_DEFAULT" +OUTPUT_DIR="$OUTPUT_DIR_DEFAULT" +MODE="full" +START_HEIGHT="" +END_HEIGHT="" +QUICK_MODE=false +VERBOSE=false +SAMPLE_RATE=0.01 # 1% sample by default +TIMESTAMP=$(date +"%Y%m%d-%H%M%S") +TEMP_DB="" +LOG_FILE="" + +usage() { + cat <> "$LOG_FILE" + fi +} + +check_dependencies() { + log "DEBUG" "Checking dependencies..." + + if ! command -v duckdb >/dev/null 2>&1; then + log "ERROR" "duckdb CLI is required but not found in PATH" + exit 2 + fi + + log "DEBUG" "All dependencies satisfied" +} + +setup_workspace() { + mkdir -p "$OUTPUT_DIR" + + TEMP_DB="$(mktemp -u /tmp/parquet_validation_XXXX.duckdb)" + LOG_FILE="$OUTPUT_DIR/validation-$TIMESTAMP.log" + + log "DEBUG" "Workspace setup complete" + log "DEBUG" "Temp database: $TEMP_DB" + log "DEBUG" "Log file: $LOG_FILE" +} + +cleanup() { + if [[ -n "$TEMP_DB" && -f "$TEMP_DB" ]]; then + rm -f "$TEMP_DB" "$TEMP_DB.wal" + fi +} + +validate_inputs() { + local errors=0 + + if [[ "$MODE" == "source" || "$MODE" == "full" ]]; then + if [[ ! -f "$CORE_DB" ]]; then + log "ERROR" "Core database not found: $CORE_DB" + ((errors++)) + fi + + if [[ ! -f "$BUNDLES_DB" ]]; then + log "ERROR" "Bundles database not found: $BUNDLES_DB" + ((errors++)) + fi + fi + + if [[ "$MODE" == "source" || "$MODE" == "full" ]]; then + if [[ ! -d "$PARQUET_DIR_1" ]]; then + log "ERROR" "Parquet directory 1 not found: $PARQUET_DIR_1" + ((errors++)) + fi + fi + + if [[ "$MODE" == "compare" || "$MODE" == "full" ]]; then + if [[ ! -d "$PARQUET_DIR_1" ]]; then + log "ERROR" "Parquet directory 1 not found: $PARQUET_DIR_1" + ((errors++)) + fi + + if [[ ! -d "$PARQUET_DIR_2" ]]; then + log "ERROR" "Parquet directory 2 not found: $PARQUET_DIR_2" + ((errors++)) + fi + fi + + if [[ -n "$START_HEIGHT" && -n "$END_HEIGHT" ]]; then + if ! [[ "$START_HEIGHT" =~ ^[0-9]+$ ]] || ! [[ "$END_HEIGHT" =~ ^[0-9]+$ ]]; then + log "ERROR" "Height values must be numeric" + ((errors++)) + elif (( START_HEIGHT > END_HEIGHT )); then + log "ERROR" "Start height cannot be greater than end height" + ((errors++)) + fi + fi + + if [[ "$errors" -gt 0 ]]; then + log "ERROR" "Validation failed with $errors error(s)" + exit 2 + fi +} + +setup_duckdb() { + log "DEBUG" "Setting up DuckDB environment..." + + # Create database and load schema + cat > /tmp/schema_setup.sql < "$json_report" <&2 + exit 2 + fi + shift 2 + ;; + --core-db) + CORE_DB="$2" + shift 2 + ;; + --bundles-db) + BUNDLES_DB="$2" + shift 2 + ;; + --dir1) + PARQUET_DIR_1="$2" + shift 2 + ;; + --dir2) + PARQUET_DIR_2="$2" + shift 2 + ;; + --output-dir) + OUTPUT_DIR="$2" + shift 2 + ;; + --start-height) + START_HEIGHT="$2" + shift 2 + ;; + --end-height) + END_HEIGHT="$2" + shift 2 + ;; + --quick) + QUICK_MODE=true + shift 1 + ;; + --verbose) + VERBOSE=true + shift 1 + ;; + --sample-rate) + SAMPLE_RATE="$2" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown option: $1" >&2 + usage + exit 2 + ;; + esac +done + +# Set up trap for cleanup +trap cleanup EXIT + +# Run main function +main \ No newline at end of file