dataflint · minskya · May 18, 2026 · Apr 11, 2026 · Apr 14, 2026 · May 17, 2026
diff --git a/docker/comet/.gitignore b/docker/comet/.gitignore
@@ -0,0 +1,3 @@
+jars/
+test_data/
+spark-events/
diff --git a/docker/comet/Dockerfile b/docker/comet/Dockerfile
@@ -0,0 +1,59 @@
+# Spark + Apache DataFusion Comet + DataFlint example runner
+#
+# Build arguments:
+#   SPARK_VERSION - Spark version (default: 3.5.7)
+#
+# Usage:
+#   ./run-comet-example.sh   (recommended — builds everything and runs)
+#   docker compose up --build (if jars are already in jars/)
+
+# Declared before FROM so it can select the base image tag.
+ARG SPARK_VERSION=3.5.7
+
+FROM apache/spark:${SPARK_VERSION}
+
+# An ARG declared before FROM is only visible to FROM lines; re-declare it so the
+# value is also in scope inside this stage.
+ARG SPARK_VERSION=3.5.7
+
+# Root is needed to create and chown the directories below; the image switches back
+# to the spark user further down.
+USER root
+
+# Create the event-log and test-data directories and hand them to the spark user.
+RUN mkdir -p /tmp/spark-events /opt/spark/work-dir/test_data && \
+    chown -R spark:spark /tmp/spark-events /opt/spark/work-dir/test_data
+
+# Copy all jars (Comet + DataFlint plugin + example) into Spark's jars dir
+COPY jars/*.jar /opt/spark/jars/
+
+# Copy test data. COPY creates files as root by default and runs after the chown
+# above, so --chown is what makes the copied files belong to the spark user too.
+COPY --chown=spark:spark test_data/ /opt/spark/work-dir/test_data/
+
+# Spark defaults for Comet, appended to spark-defaults.conf one property per line.
+# The JVM module-access flags are not set here: they come from the _JAVA_OPTIONS env
+# further down in this Dockerfile and apply to all JVMs spawned (driver + executors)
+# on Java 11+.
+RUN mkdir -p /opt/spark/conf && \
+    printf '%s\n' \
+      "spark.plugins=io.dataflint.spark.SparkDataflintPlugin,org.apache.spark.CometPlugin" \
+      "spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager" \
+      "spark.comet.explainFallback.enabled=true" \
+      "spark.memory.offHeap.enabled=true" \
+      "spark.memory.offHeap.size=4g" \
+      "spark.eventLog.enabled=true" \
+      "spark.eventLog.dir=/tmp/spark-events" \
+      "spark.ui.port=10000" \
+      "spark.dataflint.telemetry.enabled=false" \
+      "spark.sql.maxMetadataStringLength=10000" \
+      "spark.sql.adaptive.enabled=true" \
+      >> /opt/spark/conf/spark-defaults.conf
+
+# Run as the unprivileged spark user from here on.
+USER spark
+
+WORKDIR /opt/spark/work-dir
+
+# Same port as the spark.ui.port value written to spark-defaults.conf above.
+EXPOSE 10000
+
+# JVM module-access flags plus the Netty reflection switch.
+ENV _JAVA_OPTIONS="--add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/jdk.internal.misc=ALL-UNNAMED -Dio.netty.tryReflectionSetAccessible=true"
+
+# Default command: submit the Comet example to a local[*] master
+CMD [ \
+  "/opt/spark/bin/spark-submit", \
+  "--master", "local[*]", \
+  "--class", "io.dataflint.example.DataFusionCometExample", \
+  "--driver-memory", "2g", \
+  "/opt/spark/jars/example.jar" \
+]
diff --git a/docker/comet/docker-compose.yml b/docker/comet/docker-compose.yml
@@ -0,0 +1,16 @@
+services:
+  spark-comet-example:
+    image: dataflint-comet-example:${SPARK_VERSION:-3.5.7}
+    container_name: dataflint-comet-example
+    build:
+      context: .
+      dockerfile: Dockerfile
+      args:
+        # Forwarded to the Dockerfile's SPARK_VERSION build argument.
+        SPARK_VERSION: ${SPARK_VERSION:-3.5.7}
+    ports:
+      # Host port (overridable via SPARK_UI_PORT) -> port 10000 inside the container.
+      - "${SPARK_UI_PORT:-10000}:10000"
+    volumes:
+      # Host directory (overridable via SPARK_EVENTS_DIR) mounted at /tmp/spark-events.
+      - ${SPARK_EVENTS_DIR:-./spark-events}:/tmp/spark-events
+    environment:
+      SPARK_NO_DAEMONIZE: "true"
+    restart: "no"
diff --git a/docker/comet/run-comet-example.sh b/docker/comet/run-comet-example.sh
@@ -0,0 +1,146 @@
+#!/bin/bash
+set -e
+
+# Run DataFlint Apache DataFusion Comet Example
+#
+# This script:
+#   1. Downloads the Apache Comet jar from Maven Central (cached)
+#   2. Builds the DataFlint UI and plugin jar
+#   3. Packages the Comet example app
+#   4. Builds and runs the Docker container
+#
+# Apache Comet ships native libraries inside the single Maven jar (Linux x86_64 + aarch64).
+# Running on macOS goes through Docker (Linux container) because the released jar does not
+# include darwin natives — local `sbt run` will fail on macOS.
+#
+# Prerequisites: Node.js 20+, Java 8+, sbt, Docker, curl
+#
+# Usage:
+#   ./run-comet-example.sh              # full build + run
+#   ./run-comet-example.sh --skip-build # skip sbt/npm, just rebuild Docker
+#   ./run-comet-example.sh --amd64      # force x86_64 (Rosetta 2 emulation)
+#
+# Environment overrides (defaults below): SPARK_VERSION, SCALA_VERSION, COMET_VERSION
+
+# Absolute paths derived from this script's location, so it works from any cwd.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+JARS_DIR="$SCRIPT_DIR/jars"
+TEST_DATA_DIR="$SCRIPT_DIR/test_data"
+SPARK_EVENTS_DIR="$SCRIPT_DIR/spark-events"
+
+SPARK_VERSION="${SPARK_VERSION:-3.5.7}"
+SCALA_VERSION="${SCALA_VERSION:-2.12}"
+COMET_VERSION="${COMET_VERSION:-0.4.0}"
+
+SKIP_BUILD=false
+FORCE_AMD64=false
+
+for arg in "$@"; do
+  case "$arg" in
+    --skip-build) SKIP_BUILD=true ;;
+    --amd64) FORCE_AMD64=true ;;
+    *)
+      # Fail fast on typos (e.g. --skip-buid) instead of silently doing a full build.
+      echo "ERROR: unknown option: $arg (supported: --skip-build, --amd64)" >&2
+      exit 1
+      ;;
+  esac
+done
+
+# Apple Silicon can run the aarch64 natives that ship inside the Comet jar; --amd64 forces
+# Rosetta emulation if you specifically need x86_64.
+ARCH=$(uname -m)
+if [ "$FORCE_AMD64" = true ]; then
+  DOCKER_PLATFORM="--platform linux/amd64"
+elif [ "$ARCH" = "arm64" ] || [ "$ARCH" = "aarch64" ]; then
+  DOCKER_PLATFORM=""
+else
+  DOCKER_PLATFORM=""
+fi
+
+# Maven coordinates of the Comet build for Spark 3.5 and the selected Scala version.
+COMET_ARTIFACT="comet-spark-spark3.5_${SCALA_VERSION}"
+COMET_JAR_NAME="${COMET_ARTIFACT}-${COMET_VERSION}.jar"
+COMET_JAR_URL="https://repo1.maven.org/maven2/org/apache/datafusion/${COMET_ARTIFACT}/${COMET_VERSION}/${COMET_JAR_NAME}"
+
+# Print the run configuration, followed by a blank line.
+printf '%s\n' \
+  "=== DataFlint Apache Comet Example ===" \
+  "Project root:  $PROJECT_ROOT" \
+  "Spark version: $SPARK_VERSION" \
+  "Comet version: $COMET_VERSION" \
+  "Comet jar:     $COMET_JAR_NAME" \
+  ""
+
+# Make sure the jar staging directory and the event-log directory exist.
+mkdir -p "$JARS_DIR" "$SPARK_EVENTS_DIR"
+
+# --- Step 1: Download Comet jar (cached) ---
+echo "=== Step 1: Downloading Comet jar ==="
+if [ -f "$JARS_DIR/$COMET_JAR_NAME" ]; then
+  echo "Comet jar already cached: $JARS_DIR/$COMET_JAR_NAME"
+else
+  echo "Downloading: $COMET_JAR_URL"
+  # Download under a temporary name and rename only after curl succeeds. Otherwise an
+  # interrupted transfer would leave a truncated file at the final path, and the next
+  # run would treat it as a valid cached jar. The .part suffix also keeps a partial
+  # file out of the Dockerfile's jars/*.jar copy.
+  COMET_JAR_PART="$JARS_DIR/$COMET_JAR_NAME.part"
+  curl -fSL -o "$COMET_JAR_PART" "$COMET_JAR_URL"
+  mv "$COMET_JAR_PART" "$JARS_DIR/$COMET_JAR_NAME"
+  echo "Downloaded successfully."
+fi
+
+# Steps 2-4 rebuild the UI, the plugin jar and the example jar; --skip-build bypasses
+# all three.
+if [ "$SKIP_BUILD" = false ]; then
+  # --- Step 2: Build DataFlint UI ---
+  echo ""
+  echo "=== Step 2: Building DataFlint UI ==="
+  cd "$PROJECT_ROOT/spark-ui"
+  # Dependencies are installed only when node_modules is absent.
+  if [ ! -d "node_modules" ]; then
+    echo "Installing npm dependencies..."
+    npm ci
+  fi
+  echo "Building and deploying UI into plugin resources..."
+  npm run deploy
+
+  # --- Step 3: Build DataFlint plugin jar ---
+  echo ""
+  echo "=== Step 3: Building DataFlint plugin jar ==="
+  cd "$PROJECT_ROOT/spark-plugin"
+  # JVM settings for sbt; exported, so the Step 4 sbt invocation inherits them too.
+  export SBT_OPTS="-Xmx4G -Xss2M -XX:+UseG1GC"
+  sbt "pluginspark3/assembly"
+
+  # --- Step 4: Package example jar ---
+  echo ""
+  echo "=== Step 4: Packaging example jar ==="
+  # Still running in $PROJECT_ROOT/spark-plugin from Step 3.
+  sbt "example_3_5_1/package"
+fi
+
+# --- Step 5: Copy jars to docker context ---
+echo ""
+echo "=== Step 5: Copying jars to Docker context ==="
+
+# DataFlint plugin jar
+PLUGIN_JAR=$(find "$PROJECT_ROOT/spark-plugin/pluginspark3/target/scala-${SCALA_VERSION}" -name "spark_${SCALA_VERSION}-*.jar" -type f | head -1)
+if [ -z "$PLUGIN_JAR" ]; then
+  echo "ERROR: DataFlint plugin jar not found. Run without --skip-build first."
+  exit 1
+fi
+cp "$PLUGIN_JAR" "$JARS_DIR/dataflint-plugin.jar"
+echo "Copied DataFlint plugin: $(basename "$PLUGIN_JAR")"
+
+# Example jar
+EXAMPLE_JAR=$(ls -t "$PROJECT_ROOT/spark-plugin/example_3_5_1/target/scala-${SCALA_VERSION}"/dataflintsparkexample351_${SCALA_VERSION}-*.jar 2>/dev/null | head -1)
+if [ -z "$EXAMPLE_JAR" ]; then
+  echo "ERROR: Example jar not found. Run without --skip-build first."
+  exit 1
+fi
+cp "$EXAMPLE_JAR" "$JARS_DIR/example.jar"
+echo "Copied example jar: $(basename "$EXAMPLE_JAR")"
+
+echo "Comet jar: $COMET_JAR_NAME"
+
+# --- Step 6: Copy test data ---
+echo ""
+echo "=== Step 6: Copying test data ==="
+rm -rf "$TEST_DATA_DIR"
+cp -r "$PROJECT_ROOT/spark-plugin/test_data" "$TEST_DATA_DIR"
+echo "Copied test_data/"
+
+# --- Step 7: Build and run Docker ---
+echo ""
+echo "=== Step 7: Building and running Docker container ==="
+cd "$SCRIPT_DIR"
+
+# Stop any previous container
+docker compose down 2>/dev/null || true
+
+# Build with platform flag if needed
+if [ -n "$DOCKER_PLATFORM" ]; then
+  DOCKER_DEFAULT_PLATFORM=linux/amd64 docker compose up --build
+else
+  docker compose up --build
+fi
diff --git a/docker/gluten/.gitignore b/docker/gluten/.gitignore
@@ -0,0 +1,3 @@
+jars/
+test_data/
+spark-events/
diff --git a/docker/gluten/Dockerfile b/docker/gluten/Dockerfile
@@ -0,0 +1,59 @@
+# Spark + Gluten/Velox + DataFlint example runner
+#
+# Build arguments:
+#   SPARK_VERSION - Spark version (default: 3.5.7)
+#   (the Gluten bundle jar is not a build argument: every *.jar in jars/ is copied in)
+#
+# Usage:
+#   ./run-gluten-example.sh   (recommended — builds everything and runs)
+#   docker compose up --build (if jars are already in jars/)
+
+# Declared before FROM so it can select the base image tag.
+ARG SPARK_VERSION=3.5.7
+
+FROM apache/spark:${SPARK_VERSION}
+
+# An ARG declared before FROM is only visible to FROM lines; re-declare it so the
+# value is also in scope inside this stage.
+ARG SPARK_VERSION=3.5.7
+
+# Root is needed to create and chown the directories below; the image switches back
+# to the spark user further down.
+USER root
+
+# Create the event-log and test-data directories and hand them to the spark user.
+RUN mkdir -p /tmp/spark-events /opt/spark/work-dir/test_data && \
+    chown -R spark:spark /tmp/spark-events /opt/spark/work-dir/test_data
+
+# Copy all jars (Gluten bundle + DataFlint plugin + example) into Spark's jars dir
+COPY jars/*.jar /opt/spark/jars/
+
+# Copy test data. COPY creates files as root by default and runs after the chown
+# above, so --chown is what makes the copied files belong to the spark user too.
+COPY --chown=spark:spark test_data/ /opt/spark/work-dir/test_data/
+
+# Spark defaults for Gluten, appended to spark-defaults.conf one property per line.
+# The JVM module-access flags are not set here: they come from the _JAVA_OPTIONS env
+# further down in this Dockerfile and apply to all JVMs spawned (driver + executors)
+# on Java 11+.
+RUN mkdir -p /opt/spark/conf && \
+    printf '%s\n' \
+      "spark.plugins=io.dataflint.spark.SparkDataflintPlugin,org.apache.gluten.GlutenPlugin" \
+      "spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager" \
+      "spark.memory.offHeap.enabled=true" \
+      "spark.memory.offHeap.size=4g" \
+      "spark.eventLog.enabled=true" \
+      "spark.eventLog.dir=/tmp/spark-events" \
+      "spark.ui.port=10000" \
+      "spark.dataflint.telemetry.enabled=false" \
+      "spark.sql.maxMetadataStringLength=10000" \
+      "spark.sql.adaptive.enabled=true" \
+      >> /opt/spark/conf/spark-defaults.conf
+
+# Run as the unprivileged spark user from here on.
+USER spark
+
+WORKDIR /opt/spark/work-dir
+
+# Same port as the spark.ui.port value written to spark-defaults.conf above.
+EXPOSE 10000
+
+# JVM module-access flags plus the Netty reflection switch.
+ENV _JAVA_OPTIONS="--add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/jdk.internal.misc=ALL-UNNAMED -Dio.netty.tryReflectionSetAccessible=true"
+
+# Default command: submit the Gluten example to a local[*] master
+CMD [ \
+  "/opt/spark/bin/spark-submit", \
+  "--master", "local[*]", \
+  "--class", "io.dataflint.example.GlutenVeloxExample", \
+  "--driver-memory", "2g", \
+  "/opt/spark/jars/example.jar" \
+]
diff --git a/docker/gluten/docker-compose.yml b/docker/gluten/docker-compose.yml
@@ -0,0 +1,16 @@
+services:
+  spark-gluten-example:
+    image: dataflint-gluten-example:${SPARK_VERSION:-3.5.7}
+    container_name: dataflint-gluten-example
+    build:
+      context: .
+      dockerfile: Dockerfile
+      args:
+        # Forwarded to the Dockerfile's SPARK_VERSION build argument.
+        SPARK_VERSION: ${SPARK_VERSION:-3.5.7}
+    ports:
+      # Host port (overridable via SPARK_UI_PORT) -> port 10000 inside the container.
+      - "${SPARK_UI_PORT:-10000}:10000"
+    volumes:
+      # Host directory (overridable via SPARK_EVENTS_DIR) mounted at /tmp/spark-events.
+      - ${SPARK_EVENTS_DIR:-./spark-events}:/tmp/spark-events
+    environment:
+      SPARK_NO_DAEMONIZE: "true"
+    restart: "no"