Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docker/comet/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
jars/
test_data/
spark-events/
59 changes: 59 additions & 0 deletions docker/comet/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Spark + Apache DataFusion Comet + DataFlint example runner
#
# Build arguments:
# SPARK_VERSION - Spark version (default: 3.5.7)
#
# Usage:
# ./run-comet-example.sh (recommended — builds everything and runs)
# docker compose up --build (if jars are already in jars/)

# SPARK_VERSION selects the tag of the base image. It is declared before FROM
# so that it can be referenced in the FROM line.
ARG SPARK_VERSION=3.5.7

FROM apache/spark:${SPARK_VERSION}

# Re-declared inside the stage: an ARG declared before FROM is only visible to
# FROM lines. No later instruction in this Dockerfile currently reads it.
ARG SPARK_VERSION=3.5.7

# The following RUN/COPY steps create directories and write under /opt/spark.
USER root

# Create directories for event logs and test data, owned by the spark user
RUN mkdir -p /tmp/spark-events && \
    chown -R spark:spark /tmp/spark-events && \
    mkdir -p /opt/spark/work-dir/test_data && \
    chown -R spark:spark /opt/spark/work-dir/test_data

# Copy all jars (Comet + DataFlint plugin + example) into Spark's jars dir
COPY jars/*.jar /opt/spark/jars/

# Copy test data. COPY creates files owned by root unless --chown is given, and
# the chown in the RUN step above runs before these files exist, so ownership
# is set here to keep the copied data owned by the spark user as well.
COPY --chown=spark:spark test_data/ /opt/spark/work-dir/test_data/

# Append the Comet + DataFlint settings to spark-defaults.conf (one property per line).
# The JVM module-access flags are not set here: they are provided through the
# _JAVA_OPTIONS environment variable defined near the end of this Dockerfile.
RUN mkdir -p /opt/spark/conf && \
    printf '%s\n' \
      "spark.plugins=io.dataflint.spark.SparkDataflintPlugin,org.apache.spark.CometPlugin" \
      "spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager" \
      "spark.comet.explainFallback.enabled=true" \
      "spark.memory.offHeap.enabled=true" \
      "spark.memory.offHeap.size=4g" \
      "spark.eventLog.enabled=true" \
      "spark.eventLog.dir=/tmp/spark-events" \
      "spark.ui.port=10000" \
      "spark.dataflint.telemetry.enabled=false" \
      "spark.sql.maxMetadataStringLength=10000" \
      "spark.sql.adaptive.enabled=true" \
      >> /opt/spark/conf/spark-defaults.conf

# Switch from root back to the spark user for everything that runs in the container.
USER spark

# JVM flags (module --add-opens entries and a Netty system property) supplied
# through the _JAVA_OPTIONS environment variable.
ENV _JAVA_OPTIONS="--add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/jdk.internal.misc=ALL-UNNAMED -Dio.netty.tryReflectionSetAccessible=true"

WORKDIR /opt/spark/work-dir

# Same port as the spark.ui.port value written to spark-defaults.conf.
EXPOSE 10000

# Default command: submit the Comet example class from example.jar in local mode.
CMD [ \
    "/opt/spark/bin/spark-submit", \
    "--master", "local[*]", \
    "--class", "io.dataflint.example.DataFusionCometExample", \
    "--driver-memory", "2g", \
    "/opt/spark/jars/example.jar" \
]
16 changes: 16 additions & 0 deletions docker/comet/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Compose definition for the Spark + Comet + DataFlint example container.
# SPARK_VERSION, SPARK_UI_PORT and SPARK_EVENTS_DIR may be overridden from the
# environment; the defaults after ":-" are used otherwise.
services:
  spark-comet-example:
    build:
      context: .
      dockerfile: Dockerfile
      args:
        SPARK_VERSION: ${SPARK_VERSION:-3.5.7}
    image: dataflint-comet-example:${SPARK_VERSION:-3.5.7}
    container_name: dataflint-comet-example
    ports:
      # Host port is configurable; the container side is fixed at 10000.
      - "${SPARK_UI_PORT:-10000}:10000"
    volumes:
      # Host directory mounted over the container's /tmp/spark-events.
      - ${SPARK_EVENTS_DIR:-./spark-events}:/tmp/spark-events
    environment:
      - SPARK_NO_DAEMONIZE=true
    # One-shot example run: do not restart the container when it exits.
    restart: "no"
146 changes: 146 additions & 0 deletions docker/comet/run-comet-example.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
#!/bin/bash
set -e

# Run DataFlint Apache DataFusion Comet Example
#
# This script:
#   1. Downloads the Apache Comet jar from Maven Central (cached in jars/)
#   2. Builds the DataFlint UI
#   3. Builds the DataFlint plugin jar
#   4. Packages the Comet example app
#   5. Copies the plugin and example jars into the Docker build context
#   6. Copies the test data into the Docker build context
#   7. Builds and runs the Docker container
#
# Apache Comet ships native libraries inside the single Maven jar (Linux x86_64 + aarch64).
# Running on macOS goes through Docker (Linux container) because the released jar does not
# include darwin natives — local `sbt run` will fail on macOS.
#
# Prerequisites: Node.js 20+, Java 8+, sbt, Docker, curl
#
# Environment overrides: SPARK_VERSION, SCALA_VERSION, COMET_VERSION
#
# Usage:
#   ./run-comet-example.sh               # full build + run
#   ./run-comet-example.sh --skip-build  # skip sbt/npm, just rebuild Docker
#   ./run-comet-example.sh --amd64       # force x86_64 (Rosetta 2 emulation)

# Absolute locations derived from where this script lives.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
JARS_DIR="$SCRIPT_DIR/jars"
TEST_DATA_DIR="$SCRIPT_DIR/test_data"
SPARK_EVENTS_DIR="$SCRIPT_DIR/spark-events"

# Versions; each one can be overridden from the environment.
SPARK_VERSION="${SPARK_VERSION:-3.5.7}"
SCALA_VERSION="${SCALA_VERSION:-2.12}"
COMET_VERSION="${COMET_VERSION:-0.4.0}"

SKIP_BUILD=false
FORCE_AMD64=false

# Parse flags. Unknown arguments are rejected so that a mistyped flag
# (e.g. --skip-biuld) does not silently fall back to the default behaviour.
for arg in "$@"; do
  case "$arg" in
    --skip-build) SKIP_BUILD=true ;;
    --amd64) FORCE_AMD64=true ;;
    *)
      echo "ERROR: Unknown option: $arg" >&2
      echo "Usage: $0 [--skip-build] [--amd64]" >&2
      exit 1
      ;;
  esac
done

# Platform selection. By default no platform is forced, so Docker uses the host
# architecture (Apple Silicon runs the aarch64 natives that ship inside the
# Comet jar). --amd64 forces x86_64, i.e. Rosetta emulation on Apple Silicon.
# DOCKER_PLATFORM is only tested for being non-empty further down.
ARCH=$(uname -m)
DOCKER_PLATFORM=""
if [ "$FORCE_AMD64" = true ]; then
  DOCKER_PLATFORM="--platform linux/amd64"
fi

# Maven coordinates of the Comet jar (artifact id embeds the Scala version).
comet_artifact="comet-spark-spark3.5_${SCALA_VERSION}"
COMET_JAR_NAME="${comet_artifact}-${COMET_VERSION}.jar"
COMET_JAR_URL="https://repo1.maven.org/maven2/org/apache/datafusion/${comet_artifact}/${COMET_VERSION}/${COMET_JAR_NAME}"

# Print a summary of the run configuration, followed by a blank line.
printf '%s\n' \
  "=== DataFlint Apache Comet Example ===" \
  "Project root: $PROJECT_ROOT" \
  "Spark version: $SPARK_VERSION" \
  "Comet version: $COMET_VERSION" \
  "Comet jar: $COMET_JAR_NAME" \
  ""

# Make sure the jar cache and the event-log directory exist.
mkdir -p "$JARS_DIR" "$SPARK_EVENTS_DIR"

# --- Step 1: Download Comet jar (cached) ---
# The jar is downloaded to a ".part" file and renamed only after curl succeeds,
# so an interrupted or failed download is never mistaken for a cached jar on
# the next run. An empty file is also not accepted as a cache hit.
echo "=== Step 1: Downloading Comet jar ==="
if [ -s "$JARS_DIR/$COMET_JAR_NAME" ]; then
  echo "Comet jar already cached: $JARS_DIR/$COMET_JAR_NAME"
else
  echo "Downloading: $COMET_JAR_URL"
  comet_jar_tmp="$JARS_DIR/$COMET_JAR_NAME.part"
  if ! curl -fSL -o "$comet_jar_tmp" "$COMET_JAR_URL"; then
    rm -f "$comet_jar_tmp"
    echo "ERROR: Failed to download $COMET_JAR_URL" >&2
    exit 1
  fi
  mv "$comet_jar_tmp" "$JARS_DIR/$COMET_JAR_NAME"
  echo "Downloaded successfully."
fi

# Steps 2-4 run only when --skip-build was not given.
if [ "$SKIP_BUILD" = false ]; then
  # --- Step 2: Build DataFlint UI ---
  printf '\n%s\n' "=== Step 2: Building DataFlint UI ==="
  cd "$PROJECT_ROOT/spark-ui"
  # Install dependencies only when there is no node_modules directory yet.
  if ! [ -d node_modules ]; then
    echo "Installing npm dependencies..."
    npm ci
  fi
  echo "Building and deploying UI into plugin resources..."
  npm run deploy

  # --- Step 3: Build DataFlint plugin jar ---
  printf '\n%s\n' "=== Step 3: Building DataFlint plugin jar ==="
  cd "$PROJECT_ROOT/spark-plugin"
  # JVM options for sbt; exported, so they also apply to the sbt run in step 4.
  SBT_OPTS="-Xmx4G -Xss2M -XX:+UseG1GC"
  export SBT_OPTS
  sbt pluginspark3/assembly

  # --- Step 4: Package example jar ---
  printf '\n%s\n' "=== Step 4: Packaging example jar ==="
  sbt example_3_5_1/package
fi

# --- Step 5: Copy jars to docker context ---
echo ""
echo "=== Step 5: Copying jars to Docker context ==="

# DataFlint plugin jar.
# Pick the most recently modified match rather than whichever one find happens
# to list first, so a stale jar left over from an earlier build is not copied.
# This matches the newest-first selection used for the example jar below.
PLUGIN_JAR=""
while IFS= read -r -d '' plugin_candidate; do
  if [ -z "$PLUGIN_JAR" ] || [ "$plugin_candidate" -nt "$PLUGIN_JAR" ]; then
    PLUGIN_JAR="$plugin_candidate"
  fi
done < <(find "$PROJECT_ROOT/spark-plugin/pluginspark3/target/scala-${SCALA_VERSION}" -name "spark_${SCALA_VERSION}-*.jar" -type f -print0)
if [ -z "$PLUGIN_JAR" ]; then
  echo "ERROR: DataFlint plugin jar not found. Run without --skip-build first."
  exit 1
fi
cp "$PLUGIN_JAR" "$JARS_DIR/dataflint-plugin.jar"
echo "Copied DataFlint plugin: $(basename "$PLUGIN_JAR")"

# Example jar (newest match first; ls errors are suppressed so that a missing
# jar ends up in the explicit error message below).
EXAMPLE_JAR=$(ls -t "$PROJECT_ROOT/spark-plugin/example_3_5_1/target/scala-${SCALA_VERSION}"/dataflintsparkexample351_${SCALA_VERSION}-*.jar 2>/dev/null | head -1)
if [ -z "$EXAMPLE_JAR" ]; then
  echo "ERROR: Example jar not found. Run without --skip-build first."
  exit 1
fi
cp "$EXAMPLE_JAR" "$JARS_DIR/example.jar"
echo "Copied example jar: $(basename "$EXAMPLE_JAR")"

# The Comet jar was already placed in the jars directory by step 1.
echo "Comet jar: $COMET_JAR_NAME"

# --- Step 6: Copy test data ---
printf '\n%s\n' "=== Step 6: Copying test data ==="
# Replace any previous copy with a fresh one from the plugin project.
rm -rf "$TEST_DATA_DIR"
cp -r "$PROJECT_ROOT/spark-plugin/test_data" "$TEST_DATA_DIR"
echo "Copied test_data/"

# --- Step 7: Build and run Docker ---
printf '\n%s\n' "=== Step 7: Building and running Docker container ==="
cd "$SCRIPT_DIR"

# Remove a container left over from a previous run; failures are ignored.
docker compose down 2>/dev/null || true

# Without a forced platform, build and run natively; otherwise force
# linux/amd64 for this invocation through DOCKER_DEFAULT_PLATFORM.
if [ -z "$DOCKER_PLATFORM" ]; then
  docker compose up --build
else
  DOCKER_DEFAULT_PLATFORM=linux/amd64 docker compose up --build
fi
3 changes: 3 additions & 0 deletions docker/gluten/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
jars/
test_data/
spark-events/
59 changes: 59 additions & 0 deletions docker/gluten/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Spark + Gluten/Velox + DataFlint example runner
#
# Build arguments:
# SPARK_VERSION - Spark version (default: 3.5.7)
# Note: the Gluten bundle jar is not a build argument; every *.jar placed in jars/ is copied into the image.
#
# Usage:
# ./run-gluten-example.sh (recommended — builds everything and runs)
# docker compose up --build (if jars are already in jars/)

# SPARK_VERSION selects the tag of the base image. It is declared before FROM
# so that it can be referenced in the FROM line.
ARG SPARK_VERSION=3.5.7

FROM apache/spark:${SPARK_VERSION}

# Re-declared inside the stage: an ARG declared before FROM is only visible to
# FROM lines. No later instruction in this Dockerfile currently reads it.
ARG SPARK_VERSION=3.5.7

# The following RUN/COPY steps create directories and write under /opt/spark.
USER root

# Create directories for event logs and test data, owned by the spark user
RUN mkdir -p /tmp/spark-events && \
    chown -R spark:spark /tmp/spark-events && \
    mkdir -p /opt/spark/work-dir/test_data && \
    chown -R spark:spark /opt/spark/work-dir/test_data

# Copy all jars (Gluten bundle + DataFlint plugin + example) into Spark's jars dir
COPY jars/*.jar /opt/spark/jars/

# Copy test data. COPY creates files owned by root unless --chown is given, and
# the chown in the RUN step above runs before these files exist, so ownership
# is set here to keep the copied data owned by the spark user as well.
COPY --chown=spark:spark test_data/ /opt/spark/work-dir/test_data/

# Append the Gluten + DataFlint settings to spark-defaults.conf (one property per line).
# The JVM module-access flags are not set here: they are provided through the
# _JAVA_OPTIONS environment variable defined near the end of this Dockerfile.
RUN mkdir -p /opt/spark/conf && \
    printf '%s\n' \
      "spark.plugins=io.dataflint.spark.SparkDataflintPlugin,org.apache.gluten.GlutenPlugin" \
      "spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager" \
      "spark.memory.offHeap.enabled=true" \
      "spark.memory.offHeap.size=4g" \
      "spark.eventLog.enabled=true" \
      "spark.eventLog.dir=/tmp/spark-events" \
      "spark.ui.port=10000" \
      "spark.dataflint.telemetry.enabled=false" \
      "spark.sql.maxMetadataStringLength=10000" \
      "spark.sql.adaptive.enabled=true" \
      >> /opt/spark/conf/spark-defaults.conf

# Switch from root back to the spark user for everything that runs in the container.
USER spark

# JVM flags (module --add-opens entries and a Netty system property) supplied
# through the _JAVA_OPTIONS environment variable.
ENV _JAVA_OPTIONS="--add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/jdk.internal.misc=ALL-UNNAMED -Dio.netty.tryReflectionSetAccessible=true"

WORKDIR /opt/spark/work-dir

# Same port as the spark.ui.port value written to spark-defaults.conf.
EXPOSE 10000

# Default command: submit the Gluten example class from example.jar in local mode.
CMD [ \
    "/opt/spark/bin/spark-submit", \
    "--master", "local[*]", \
    "--class", "io.dataflint.example.GlutenVeloxExample", \
    "--driver-memory", "2g", \
    "/opt/spark/jars/example.jar" \
]
16 changes: 16 additions & 0 deletions docker/gluten/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Compose definition for the Spark + Gluten/Velox + DataFlint example container.
# SPARK_VERSION, SPARK_UI_PORT and SPARK_EVENTS_DIR may be overridden from the
# environment; the defaults after ":-" are used otherwise.
services:
  spark-gluten-example:
    build:
      context: .
      dockerfile: Dockerfile
      args:
        SPARK_VERSION: ${SPARK_VERSION:-3.5.7}
    image: dataflint-gluten-example:${SPARK_VERSION:-3.5.7}
    container_name: dataflint-gluten-example
    ports:
      # Host port is configurable; the container side is fixed at 10000.
      - "${SPARK_UI_PORT:-10000}:10000"
    volumes:
      # Host directory mounted over the container's /tmp/spark-events.
      - ${SPARK_EVENTS_DIR:-./spark-events}:/tmp/spark-events
    environment:
      - SPARK_NO_DAEMONIZE=true
    # One-shot example run: do not restart the container when it exits.
    restart: "no"
Loading
Loading