diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ab0c0c2..f678ad3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -9,6 +9,7 @@ on: # yamllint disable-line rule:truthy jobs: build: + name: Build (${{ matrix.os }} ${{ matrix.build_type }}, CUDA=${{ matrix.cuda_version }}) runs-on: ${{ matrix.os }} strategy: fail-fast: false @@ -16,6 +17,7 @@ jobs: # Testing across Linux and macOS os: [ubuntu-latest, macos-latest] build_type: [Debug, Release] + cuda_version: [cpu] # Cuda builds are tested on our containers steps: - uses: actions/checkout@v4 @@ -31,12 +33,12 @@ jobs: run: brew install libomp - name: Build TGN - run: make build BUILD_TYPE=${{ matrix.build_type }} + run: make build BUILD_TYPE=${{ matrix.build_type }} CUDA_VERSION=${{ matrix.cuda_version }} - name: Upload Build Artifacts uses: actions/upload-artifact@v4 with: - name: build-${{ matrix.os }}-${{ matrix.build_type }} + name: build-${{ matrix.os }}-${{ matrix.cuda_version }}-${{ matrix.build_type }} path: build/ test: @@ -51,10 +53,10 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Download Debug Build + - name: Download CPU Debug Build uses: actions/download-artifact@v4 with: - name: build-${{ matrix.os }}-Debug + name: build-${{ matrix.os }}-cpu-Debug path: build/ - name: Install OpenMP (MacOS) @@ -70,10 +72,10 @@ jobs: steps: - uses: actions/checkout@v5 - - name: Download Debug Build + - name: Download CPU Debug Build uses: actions/download-artifact@v4 with: - name: build-ubuntu-latest-Debug + name: build-ubuntu-latest-cpu-Debug path: build/ - uses: cpp-linter/cpp-linter-action@v2 @@ -86,18 +88,22 @@ jobs: tidy-checks: "" # Use .clang-tidy config file test-on-container: + name: Container Build (Cuda=${{ matrix.cuda_version }}) runs-on: ubuntu-latest + strategy: + matrix: + cuda_version: [cpu, "12.6", "12.8", "13.0"] # quoted: bare 13.0 is a YAML float and interpolates as "13" steps: - uses: actions/checkout@v4 - name: Build TGN Container - run: docker build -t tgn-dev . 
+ run: docker build --build-arg CUDA_VERSION=${{ matrix.cuda_version }} -t tgn-dev:${{ matrix.cuda_version }} . - name: Run Tests inside Container run: | - docker run --rm \ - -v "$(pwd):/workspace:Z" \ - tgn-dev /bin/bash -c "make test" + # If CPU, run tests. If CUDA, just run build to verify compilation. + CMD=$([[ "${{ matrix.cuda_version }}" == "cpu" ]] && echo "make test" || echo "make build") + docker run --rm -v "$(pwd):/workspace:Z" tgn-dev:${{ matrix.cuda_version }} /bin/bash -c "$CMD" python-tests: needs: build @@ -111,10 +117,10 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Download Build Artifacts + - name: Download CPU Release Build uses: actions/download-artifact@v4 with: - name: build-${{ matrix.os }}-Release + name: build-${{ matrix.os }}-cpu-Release path: build/ - name: Setup uv diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 1d9138d..3dd75b1 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -10,12 +10,16 @@ on: # yamllint disable-line rule:truthy jobs: build-all: + name: Build (${{ matrix.os }}, CUDA=${{ matrix.cuda_version }}) runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: # Testing across Linux and macOS os: [ubuntu-latest, macos-latest] + build_type: [Release] + cuda_version: [cpu] # Cuda builds are tested on our containers + steps: - uses: actions/checkout@v4 @@ -30,12 +34,12 @@ jobs: run: brew install libomp - name: Build TGN Examples - run: make examples BUILD_TYPE=Release + run: make examples BUILD_TYPE=Release CUDA_VERSION=${{ matrix.cuda_version }} - name: Upload Build Artifact uses: actions/upload-artifact@v4 with: - name: build-${{ matrix.os }}-Release + name: build-${{ matrix.os }}-${{ matrix.cuda_version }}-Release path: build/ link-pred: @@ -53,7 +57,7 @@ jobs: - name: Download Build Artifact uses: actions/download-artifact@v4 with: - name: build-${{ matrix.os }}-Release + name: build-${{ matrix.os }}-cpu-Release path: build/ - 
name: Restore Binary Permissions @@ -87,7 +91,7 @@ jobs: - name: Download Build Artifact uses: actions/download-artifact@v4 with: - name: build-${{ matrix.os }}-Release + name: build-${{ matrix.os }}-cpu-Release path: build/ - name: Restore Binary Permissions @@ -107,15 +111,19 @@ jobs: run: make run-node-${{ matrix.dataset }} node-pred-on-container: + name: Container Integration (CUDA=${{ matrix.cuda_version }}) runs-on: ubuntu-latest + strategy: + matrix: + cuda_version: [cpu, "12.6", "12.8", "13.0"] # quoted: bare 13.0 is a YAML float and interpolates as "13" steps: - uses: actions/checkout@v4 - name: Build TGN Container - run: docker build -t tgn-dev . + run: docker build --build-arg CUDA_VERSION=${{ matrix.cuda_version }} -t tgn-dev:${{ matrix.cuda_version }} . - name: Run Node Prediction run: | - docker run --rm \ - -v "$(pwd):/workspace:Z" \ - tgn-dev /bin/bash -c "make run-node-tgbn-trade" + # If CPU: run full node prediction. If CUDA: just verify 'make examples' compiles. + CMD=$([[ "${{ matrix.cuda_version }}" == "cpu" ]] && echo "make run-node-tgbn-trade" || echo "make examples") + docker run --rm -v "$(pwd):/workspace:Z" tgn-dev:${{ matrix.cuda_version }} /bin/bash -c "$CMD" diff --git a/CMakeLists.txt b/CMakeLists.txt index c434c6a..2615769 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,10 +26,22 @@ add_compile_options( include(FetchContent) +set(CUDA_VERSION "cpu" CACHE STRING "CUDA version (cpu, 12.6, 12.8, 13.0)") + if(APPLE) # Target Apple Silicon (M1/M2/M3) set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cpu/libtorch-macos-arm64-2.10.0.zip") + message(STATUS "TGUF: Target System is APPLE (ARM64). 
Using CPU LibTorch.") else() # Target Linux x86_64 - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cpu/libtorch-shared-with-deps-2.10.0%2Bcpu.zip") + if(CUDA_VERSION STREQUAL "cpu") + set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cpu/libtorch-shared-with-deps-2.10.0%2Bcpu.zip") + message(STATUS "TGUF: Target System is LINUX (CPU).") + else() + # Clean "12.6" -> "126" for the URL mapping + string(REPLACE "." "" CUDA_TAG ${CUDA_VERSION}) + set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu${CUDA_TAG}/libtorch-shared-with-deps-2.10.0%2Bcu${CUDA_TAG}.zip") + message(STATUS "TGUF: Target System is LINUX (CUDA ${CUDA_VERSION}).") + enable_language(CUDA) + endif() endif() FetchContent_Declare( diff --git a/Dockerfile b/Dockerfile index 8711ff5..521ead5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,7 @@ FROM ubuntu:24.04 +# Default is "cpu". Pass "12.6", "12.8", or "13.0" to trigger CUDA install. +ARG CUDA_VERSION=cpu ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update && apt-get install -y \ @@ -11,12 +12,23 @@ RUN apt-get update && apt-get install -y \ make \ git \ curl \ + wget \ ca-certificates \ + && \ + if [ "$CUDA_VERSION" != "cpu" ]; then \ + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb && \ + dpkg -i cuda-keyring_1.1-1_all.deb && \ + apt-get update && \ + PACKAGE_SUFFIX=$(echo $CUDA_VERSION | sed 's/\./-/g') && \ + apt-get install -y cuda-toolkit-${PACKAGE_SUFFIX} ; \ + fi \ && rm -rf /var/lib/apt/lists/* COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /usr/local/bin/ ENV CC=clang ENV CXX=clang++ +ENV PATH=/usr/local/cuda-${CUDA_VERSION}/bin:${PATH} +ENV LD_LIBRARY_PATH=/usr/local/cuda-${CUDA_VERSION}/lib64 WORKDIR /workspace diff --git a/Makefile b/Makefile index dbd0205..8d4858c 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,27 @@ BUILD_DIR := build PROFILE_DIR := build-profile -CMAKE_FLAGS := 
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON + +CUDA_VERSION ?= cpu +GPU_ARCH ?= native + +CMAKE_FLAGS := -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCUDA_VERSION=$(CUDA_VERSION) + +ifneq ($(CUDA_VERSION), cpu) + CMAKE_FLAGS += -DCMAKE_CUDA_ARCHITECTURES=$(GPU_ARCH) + + # Add Torch-specific Arch list (converts 80 -> 8.0, 120 -> 12.0) + ifneq ($(GPU_ARCH), native) + TORCH_ARCH := $(shell echo $(GPU_ARCH) | sed 's/\([0-9]\)$$/.\1/') + CMAKE_FLAGS += -DTORCH_CUDA_ARCH_LIST="$(TORCH_ARCH)" + endif + + # Handle CUDA Compiler Path (Look in standard /usr/local/cuda-X.Y) + CUDA_PATH := /usr/local/cuda-$(CUDA_VERSION) + ifneq ("$(wildcard $(CUDA_PATH)/bin/nvcc)","") + CMAKE_FLAGS += -DCMAKE_CUDA_COMPILER=$(CUDA_PATH)/bin/nvcc + endif +endif + NPROCS := $(shell nproc 2>/dev/null || sysctl -n hw.logicalcpu) EXAMPLE_LINK := $(BUILD_DIR)/examples/tgn_link_pred @@ -29,6 +50,10 @@ help: @echo " make examples - Build tgn_link_prop and tgn_node_prop examples" @echo " make clean - Remove build directory" @echo "" + @echo "Build Parameters (Optional):" + @echo " CUDA_VERSION= - Build for CUDA (e.g., 12.6, 12.8, 13.0). Default: cpu" + @echo " GPU_ARCH= - Compute capability (e.g., 80, 90, native). 
Default: native" + @echo "" @echo "Documentation Targets:" @echo " make docs - Build project documentation" @echo " make docs-serve - Build and serve project documentation" diff --git a/README.md b/README.md index 79252f6..c020d1e 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,9 @@ ![Clang](https://img.shields.io/badge/Compiler-Clang-orange?style=flat&labelColor=white&logo=clang&logoColor=black) ![Linux](https://img.shields.io/badge/Linux-FCC624?style=flat&logo=linux&logoColor=black) ![macOS](https://img.shields.io/badge/macOS-000000?style=flat&logo=apple&logoColor=white) +![CUDA 12.6](https://img.shields.io/badge/CUDA-12.6-76B900?style=flat&labelColor=white&logo=nvidia&logoColor=76B900) +![CUDA 12.8](https://img.shields.io/badge/CUDA-12.8-76B900?style=flat&labelColor=white&logo=nvidia&logoColor=76B900) +![CUDA 13.0](https://img.shields.io/badge/CUDA-13.0-76B900?style=flat&labelColor=white&logo=nvidia&logoColor=76B900) [![Docs](https://img.shields.io/readthedocs/tgncpp?style=flat&label=Docs&labelColor=white&logo=readthedocs&logoColor=black)](https://tgncpp.readthedocs.io/en/latest/?badge=latest) [![Tests](https://img.shields.io/github/actions/workflow/status/Jacob-Chmura/tgn.cpp/ci.yml?label=Tests&style=flat&labelColor=white&logo=github-actions&logoColor=black)](https://github.com/Jacob-Chmura/tgn.cpp/actions/workflows/ci.yml) @@ -33,7 +36,17 @@ A C++20 Port of [TGN](https://arxiv.org/abs/2006.10637) over pure LibTorch: ### Installation -You should just use the [Dockerfile](./Dockerfile), but if you prefer to install dependencies manually: +You should just use the [Dockerfile](./Dockerfile): + +```sh +# Build for CPU (default) +docker build -t tgn-dev:cpu . + +# Build for specific CUDA drivers (e.g. 12.6 for A100/H100) +docker build --build-arg CUDA_VERSION=12.6 -t tgn-dev:cu126 . 
+``` + +If you prefer a bare-metal install: ##### Linux @@ -42,6 +55,8 @@ You should just use the [Dockerfile](./Dockerfile), but if you prefer to install sudo apt-get install -y clang libc++-dev libc++abi-dev ``` +If you want to run with CUDA, refer to [nvidia docs](https://developer.nvidia.com/cuda-12-6-0-download-archive?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=20.04&target_type=deb_local) for nvidia toolkit installation. + ##### MacOS ```sh @@ -49,6 +64,16 @@ sudo apt-get install -y clang libc++-dev libc++abi-dev brew install cmake libomp ``` +> \[!Important\] +> **Platform Support**: + +| OS | **CUDA_VERSION** | Default | +| ----- | ----------------------------- | ------- | +| Linux | `cpu`, `12.6`, `12.8`, `13.0` | `cpu` | +| macOS | `cpu` | `cpu` | + +> **GPU_ARCH**: Specifies compute capability (e.g. `80`, `90`, `native`) for CUDA backend on Linux. + ##### TGUF Conversion Scripts use [uv](https://docs.astral.sh/uv/): ```sh @@ -57,15 +82,19 @@ curl -LsSf https://astral.sh/uv/install.sh | sh ### Usage -> \[!Note\] -> Tested on Linux (Ubuntu 22.04+) and macOS (Apple Silicon) +#### Setup ```sh # Clone the repo git clone git@github.com:Jacob-Chmura/tgn.cpp.git && cd tgn.cpp -# See available targets +# See all available targets make help +``` + +#### Running on CPU + +```sh # Download `tgbl-wiki` data, convert to `.tguf` and run examples/link_pred.cpp. make run-link-tgbl-wiki @@ -73,3 +102,13 @@ make run-link-tgbl-wiki # Download `tgbn-trade` data, convert to `.tguf` and run examples/node_pred.cpp make run-node-tgbn-trade ``` + +#### Running on GPU (Linux only) + +```sh +# Example: Cuda 12.6 on an A100 (Arch 80) +CUDA_VERSION=12.6 GPU_ARCH=80 make run-link-tgbl-wiki +``` + +> \[!TIP\] +> Use `nvidia-smi` to check your **CUDA_VERSION** and **GPU_ARCH**