diff --git a/Jenkinsfile.modified b/Jenkinsfile.modified new file mode 100644 index 0000000..3201c28 --- /dev/null +++ b/Jenkinsfile.modified @@ -0,0 +1,192 @@ +/* + * LSTM — Inference Evaluation Pipeline + */ +pipeline { + agent any + + environment { + BUILD_CTX = "${WORKSPACE}" + METRICS_DIR = "${WORKSPACE}/eval_metrics/lstm" + + MODEL_NAME = "lstm" + IMAGE_GPU = "py-lstm-gpu" + + CONTAINER_NAME = "eval-lstm" + EVAL_PORT = "8002" + + PATH = "/var/lib/jenkins/.local/bin:/home/ajitesh/.local/bin:${env.PATH}" + } + + stages { + stage('Setup') { + steps { + sh "mkdir -p ${METRICS_DIR}" + } + } + + stage('Build Image') { + steps { + script { + buildImg(IMAGE_GPU, 'python_ml/pytorch/LSTM/Inference/Dockerfile') + } + } + } + + stage('Image Metrics') { + steps { + script { + measureImg(IMAGE_GPU) + } + } + } + + stage('Evaluate') { + steps { + script { + evaluate(IMAGE_GPU, EVAL_PORT) + } + } + } + } + + post { + always { + sh "docker rm -f ${CONTAINER_NAME} 2>/dev/null || true" + + archiveArtifacts artifacts: 'eval_metrics/lstm/**/*', allowEmptyArchive: true + } + } +} + +def buildImg(String image, String dockerfile) { + def t0 = System.currentTimeMillis() + def status = 'SUCCESS' + + try { + sh "docker build -t ${image} -f ${dockerfile} ${BUILD_CTX}" + } catch (Exception e) { + status = 'FAILURE' + throw e + } finally { + def dur = System.currentTimeMillis() - t0 + def ts = new Date().format("yyyy-MM-dd'T'HH:mm:ss'Z'") + + sh """ + cat > ${METRICS_DIR}/${image}_build.json </dev/null || echo 0) + LC=\$(docker history -q ${image} 2>/dev/null | wc -l) + SM=\$(echo "scale=2; \$SB / 1048576" | bc) + + cat > ${METRICS_DIR}/${image}_image.json </dev/null || true" + + sh """ + T0=\$(date +%s%3N) + + docker run -d \ + --name ${CONTAINER_NAME} \ + -p ${port}:8000 \ + -v ${METRICS_DIR}:/app/devops_metrics \ + ${image} + + T1=\$(date +%s%3N) + + echo "Container launched, waiting for readiness..." 
+ + READY=0 + + for i in \$(seq 1 300); do + HEALTH_STATUS=\$(docker inspect -f '{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}' ${CONTAINER_NAME} 2>/dev/null || echo "missing") + + if [ "\$HEALTH_STATUS" = "healthy" ]; then + READY=1 + break + fi + + if curl -sf http://localhost:${port}/health > /dev/null 2>&1; then + READY=1 + break + fi + + sleep 0.1 + done + + T2=\$(date +%s%3N) + + CONTAINER_START_MS=\$((T1 - T0)) + APP_READY_MS=\$((T2 - T1)) + TOTAL_COLD_START_MS=\$((T2 - T0)) + + if [ "\$READY" -eq 1 ]; then + HEALTH_PAYLOAD=\$(curl -sf http://localhost:${port}/health 2>/dev/null || echo '{}') + + cat > ${METRICS_DIR}/cold_start.json < ${METRICS_DIR}/cold_start.json < ${METRICS_DIR}/app_metrics.json || echo '{}' > ${METRICS_DIR}/app_metrics.json + + docker stats --no-stream --format '{"dimension":"5_resources","cpu":"{{.CPUPerc}}","mem":"{{.MemUsage}}"}' ${CONTAINER_NAME} > ${METRICS_DIR}/docker_stats.json || true + + docker inspect ${CONTAINER_NAME} > ${METRICS_DIR}/container_inspect.json || true + """ + + sh "docker rm -f ${CONTAINER_NAME} 2>/dev/null || true" +} \ No newline at end of file diff --git a/latex_reports/draft_1.tex b/latex_reports/draft_1.tex new file mode 100644 index 0000000..042886f --- /dev/null +++ b/latex_reports/draft_1.tex @@ -0,0 +1,2235 @@ +\documentclass{ieeeaccess} +\usepackage{cite} +\usepackage{amsmath,amssymb,amsfonts} +\usepackage{algorithmic} +\usepackage{textcomp} + +\def\BibTeX{{\rm B\kern-.05em{\sc i\kern-.025em b}\kern-.08em + T\kern-.1667em\lower.7ex\hbox{E}\kern-.125emX}} + +% Encoding and fonts +\usepackage[T1]{fontenc} +\usepackage[utf8]{inputenc} +\usepackage{pdfpages} + +% Math and graphics +\usepackage{graphicx} +\usepackage{booktabs} +\usepackage{array} +\usepackage{float} + +% URLs (robust line breaking) +\usepackage{url} +\usepackage[hidelinks]{hyperref} +\def\UrlBreaks{\do/\do-\do_} + +% Code listings (stable for IEEE) +\usepackage{listings} +\usepackage{xcolor} +\lstset{ + 
basicstyle=\ttfamily\footnotesize, + breaklines=true, + breakatwhitespace=false, % <-- IMPORTANT (change this) + columns=fullflexible, + keepspaces=true, + showstringspaces=false +} + +% Custom Dockerfile language +\lstdefinelanguage{Dockerfile}{ + keywords={FROM, RUN, CMD, LABEL, MAINTAINER, EXPOSE, ENV, ADD, COPY, ENTRYPOINT, VOLUME, USER, WORKDIR, ARG, ONBUILD, STOPSIGNAL, HEALTHCHECK, SHELL}, + sensitive=true, + comment=[l]{\#}, + morestring=[b]" +} + +\begin{document} +\history{Date of publication xxxx 00, 0000, date of current version xxxx 00, 0000.} +\doi{10.1109/ACCESS.2017.DOI} + +\title{System-Level Evaluation of Rust and Python for Machine Learning} +\author{\uppercase{Project Elective}\authorrefmark{1}, +\IEEEmembership{Member, IEEE}} +\address[1]{Project Elective (e-mail: project@elective.com)} +\tfootnote{This paragraph of the first footnote will contain support +information, including sponsor and financial support acknowledgment. For +example, ``This work was supported in part by the U.S. Department of +Commerce under Grant BS123456.''} + +\markboth +{Project Elective \headeretal: System-Level Evaluation of Rust and Python for Machine Learning} +{Project Elective \headeretal: System-Level Evaluation of Rust and Python for Machine Learning} + +\corresp{Corresponding author: Project Elective (e-mail: project@elective.com).} + +\begin{abstract} +These instructions give you guidelines for preparing papers for +IEEE Access. Use this document as a template if you are +using \LaTeX. Otherwise, use this document as an +instruction set. The electronic file of your paper will be formatted further +at IEEE. Paper titles should be written in uppercase and lowercase letters, +not all uppercase. Avoid writing long formulas with subscripts in the title; +short formulas that identify the elements are fine (e.g., "Nd--Fe--B"). Do +not write ``(Invited)'' in the title. Full names of authors are preferred in +the author field, but are not required. 
Put a space between authors' +initials. The abstract must be a concise yet comprehensive reflection of +what is in your article. In particular, the abstract must be self-contained, +without abbreviations, footnotes, or references. It should be a microcosm of +the full article. The abstract must be between 150--250 words. Be sure that +you adhere to these limits; otherwise, you will need to edit your abstract +accordingly. The abstract must be written as one paragraph, and should not +contain displayed mathematical equations or tabular material. The abstract +should include three or four different keywords or phrases, as this will +help readers to find it. It is important to avoid over-repetition of such +phrases as this can result in a page being rejected by search engines. +Ensure that your abstract reads well and is grammatically correct. +\end{abstract} + +\begin{keywords} +Enter key words or phrases in alphabetical +order, separated by commas. For a list of suggested keywords, send a blank +e-mail to keywords@ieee.org or visit \underline +{http://www.ieee.org/organizations/pubs/ani\_prod/keywrd98.txt} +\end{keywords} + +\titlepgskip=-15pt + +\maketitle +\section{Overview of the Project} + +This project studies the use of \textbf{Rust} as an alternative systems language for machine learning workflows traditionally implemented in \textbf{Python}. +Rather than focusing on state-of-the-art model performance, the emphasis is on: + +\begin{itemize} + \item feasibility of end-to-end ML workflows, + \item system stability and reproducibility, + \item developer experience and DevOps complexity, + \item deployment and operational characteristics. +\end{itemize} + +To ensure clarity and rigor, the work is organized into \textbf{two clearly separated experimental tracks}. 
+ +\hrule + +\section{Project Structure: Two-Track Evaluation} + +The project consists of the following two tracks: + +\subsection*{Track 1: Training-Based Systems Evaluation} +This track compares \textbf{machine learning training pipelines} implemented in: +\begin{itemize} + \item PyTorch (Python), and + \item Burn (Rust). +\end{itemize} + +The goal is to evaluate training feasibility, stability, compile-time guarantees, and DevOps impact, rather than raw training speed. + +\subsection*{Track 2: Inference-Based DevOps Evaluation} +This track compares \textbf{production-style inference services} implemented in: +\begin{itemize} + \item Python-based ONNX inference, and + \item Rust-based ONNX inference. +\end{itemize} + +The focus is on deployment, security, containerization, CI/CD behavior, and runtime efficiency. + +Each track is designed to answer a distinct research question while remaining complementary. + +\section{Machine Learning Tasks Considered} + +To ensure coverage of diverse ML workloads, the following tasks are identified: + +\begin{itemize} + \item \textbf{Text Classification}: Dataset to be finalized. + \item \textbf{Image Classification}: MNIST dataset. + \item \textbf{Credit Score Assignment}: Supervised classification task. + \item \textbf{Multi-Objective Machine Learning}: Brain Tumor dataset with a MOML formulation. + \item \textbf{Fine-Tuning Task}: BERT-based classification (ANLP Assignment 1), with optional LoRA / QLoRA. + \item \textbf{Autoregressive Decoding}: Experiments using the Burn framework. +\end{itemize} + +At the current stage, the \textbf{MNIST image classification task has been fully implemented}. +The corresponding training code is available in the project GitHub repository. 
+ +\hrule + +\section{Related Work} + +The following research papers are being used to guide experimental design and evaluation: + +\begin{itemize} + \item \url{https://ieeexplore.ieee.org/document/11126113} + \item \url{https://ieeexplore.ieee.org/document/11261485} + \item \url{https://ieeexplore.ieee.org/document/11212348} + \item \url{https://www.ijsred.com/volume8/issue2/IJSRED-V8I2P143.pdf} +\end{itemize} + +\hrule + +\section{Code Repository and Current Status} + +Project repository: +\begin{center} +\url{https://github.com/Abhinav-Kumar012/Rust_Python_ML_PE.git} +\end{center} + +Current progress includes: +\begin{itemize} + \item MNIST training pipeline implemented + \item PyTorch baseline established + \item Initial Rust (Burn) training setup completed +\end{itemize} + +\hrule + +\section{Track 1: Training-Based Systems Evaluation} + +\subsection{Objective} + +The objective of this track is to answer the following research question: + +\begin{quote} +\textit{Can Rust realistically support end-to-end machine learning training pipelines, and what system-level trade-offs does this introduce compared to PyTorch?} +\end{quote} + +This track explicitly avoids speed-centric benchmarking and instead focuses on system behavior. 
+ +\hrule + +\subsection{Frameworks Compared} + +\subsubsection{PyTorch (Baseline)} +\begin{itemize} + \item Language: Python + \item Training maturity: Very high + \item Ecosystem: Extensive +\end{itemize} + +\subsubsection{Rust (Burn)} +\begin{itemize} + \item Language: Rust + \item Training maturity: Emerging + \item Design: Idiomatic Rust, native training support +\end{itemize} + +\hrule + +\subsection{Experimental Controls} + +\textbf{Fixed Across Both Implementations} +\begin{itemize} + \item Dataset splits + \item Number of epochs + \item Batch size + \item Optimizer type + \item Learning rate + \item Hardware +\end{itemize} + +\textbf{Allowed Differences} +\begin{itemize} + \item Internal kernel implementations + \item Graph execution model + \item Memory management +\end{itemize} + +\hrule + +\subsection{Metrics Collected} + +\begin{itemize} + \item Training time per epoch (reported cautiously) + \item Loss curves and convergence behavior + \item Runtime failures and numerical stability + \item Reproducibility across runs + \item Environment setup and build complexity + \item Dependency footprint and artifact size +\end{itemize} + +\hrule + +\section{Track 2: Inference-Based DevOps Evaluation} + +\subsection{Objective} + +The objective of this track is to compare \textbf{deployment, security, and operational characteristics} of Python-based and Rust-based ML inference services executing the same ONNX model. + +\hrule + +\subsection{Inference Services Compared} + +\textbf{Python Service} +\begin{itemize} + \item FastAPI + Uvicorn + \item ONNX Runtime (Python) +\end{itemize} + +\textbf{Rust Service} +\begin{itemize} + \item Axum / Actix + \item burn-rs +\end{itemize} + +Both services expose identical inference endpoints and return identical outputs. 
+ +\hrule + +\subsection{Evaluation Dimensions} + +\begin{itemize} + \item CI/CD build behavior + \item Container image size and layering + \item Cold-start latency + \item Inference latency and throughput + \item Resource utilization + \item Security and supply-chain surface +\end{itemize} + +\hrule + +\section{Upcoming Work} + +The following tasks are planned for the next phase of the project: +s +\begin{itemize} + \item Develop production-style inference services for both Python and Rust. + \item Write Dockerfiles for Python and Rust inference services. + \item Set up Jenkins-based CI pipelines for inference, including build, test, containerization, and security scanning. +\end{itemize} + +\clearpage + +\section{Task: MNIST Image Classification} +\subsection{Architecture Details} + +\begin{table*}[t!] +\centering +\renewcommand{\arraystretch}{1.3} +\begin{tabular}{|c|l|l|l|l|} +\hline +\textbf{Step} & \textbf{Layer} & \textbf{Configuration} & \textbf{Input Shape} & \textbf{Output Shape} \\ +\hline + +1 & Input & Grayscale Images & +$[B, H, W]$ & +$[B, H, W]$ \\ + +\hline +2 & Reshape & Add channel dimension & +$[B, H, W]$ & +$[B, 1, H, W]$ \\ + +\hline +3 & Conv2D (conv1) & +$1 \rightarrow 8$, kernel $3 \times 3$ & +$[B, 1, H, W]$ & +$[B, 8, H-2, W-2]$ \\ + +\hline +4 & Dropout & +$p = 0.5$ & +$[B, 8, H-2, W-2]$ & +$[B, 8, H-2, W-2]$ \\ + +\hline +5 & Conv2D (conv2) & +$8 \rightarrow 16$, kernel $3 \times 3$ & +$[B, 8, H-2, W-2]$ & +$[B, 16, H-4, W-4]$ \\ + +\hline +6 & Dropout & +$p = 0.5$ & +$[B, 16, H-4, W-4]$ & +$[B, 16, H-4, W-4]$ \\ + +\hline +7 & ReLU & +Activation & +$[B, 16, H-4, W-4]$ & +$[B, 16, H-4, W-4]$ \\ + +\hline +8 & Adaptive Avg Pool & +Output size $8 \times 8$ & +$[B, 16, H-4, W-4]$ & +$[B, 16, 8, 8]$ \\ + +\hline +9 & Flatten & +$16 \times 8 \times 8$ & +$[B, 16, 8, 8]$ & +$[B, 1024]$ \\ + +\hline +10 & Linear (fc1) & +$1024 \rightarrow \texttt{hidden\_size}$ & +$[B, 1024]$ & +$[B, \texttt{hidden\_size}]$ \\ + +\hline +11 & Dropout & +$p = 
0.5$ & +$[B, \texttt{hidden\_size}]$ & +$[B, \texttt{hidden\_size}]$ \\ + +\hline +12 & ReLU & +Activation & +$[B, \texttt{hidden\_size}]$ & +$[B, \texttt{hidden\_size}]$ \\ + +\hline +13 & Linear (fc2) & +$\texttt{hidden\_size} \rightarrow \texttt{num\_classes}$ & +$[B, \texttt{hidden\_size}]$ & +$[B, \texttt{num\_classes}]$ \\ + +\hline +\end{tabular} +\caption{Detailed architecture of the convolutional neural network implemented in Burn. +$B$ denotes batch size, $H$ and $W$ denote input image height and width respectively.} +\label{tab:burn-cnn-architecture} +\end{table*} + +\noindent\textbf{Notes:} +\begin{itemize} + \item All convolution layers use default stride = 1 and no padding. + \item Dropout probability is configurable via \texttt{ModelConfig.dropout}. + \item Adaptive average pooling ensures a fixed spatial resolution regardless of input size. + \item The model is fully differentiable and backend-agnostic via Burn's \texttt{Backend} trait. +\end{itemize} + + +\subsection{Rust Backend: Load Testing with Locust} + +The performance of the Rust-based backend was evaluated using the Locust load testing framework. The objective was to analyze system behavior under concurrent user load and measure key performance characteristics such as throughput and latency. + +\textbf{Testing Setup:} +\textbf{Testing Setup:} +\begin{itemize} + \item Tool: Locust + \item Backend: Rust (HTTP service) + \item Test Type: Concurrent user load simulation + \item Environment: Linux system +\end{itemize} + +\textbf{Dashboard Visualization:} +\textbf{Full Report:} + +The complete load testing dashboard has been exported as a PDF and is included below for detailed inspection. + +\includepdf[pages=-]{RustLocust/MNIST.pdf} + + +\subsection{Python Backend: Load Testing with Locust} + +The performance of the Python-based backend was evaluated using the Locust load testing framework. 
The goal was to assess system behavior under concurrent user load and analyze key performance characteristics such as throughput and response latency. + +\textbf{Testing Setup:} +\begin{itemize} + \item Tool: Locust + \item Backend: Python (HTTP service) + \item Test Type: Concurrent user load simulation + \item Environment: Linux system +\end{itemize} + +\textbf{Dashboard Visualization:} + +\textbf{Full Report:} + +The complete load testing dashboard has been exported as a PDF and is included below for detailed inspection. + +\includepdf[pages=-]{PythonLocust/MNIST.pdf} + + +\subsection{Rust: Dockerfile Design and Containerization Strategy} + +The Dockerfile used for the Rust-based MNIST inference application follows a multi-stage build strategy. Multi-stage builds are commonly used to reduce the size of the final container image by separating the compilation environment from the runtime environment. + +\subsubsection{Overview of Multi-Stage Build} + +The Dockerfile is divided into two major stages: + +\begin{enumerate} + \item Builder Stage + \item Runtime Stage +\end{enumerate} + +The builder stage is responsible for compiling the Rust application, while the runtime stage contains only the compiled binary and the required runtime dependencies. + +\subsubsection{Builder Stage} + +The first stage begins with: + +\begin{lstlisting} +FROM ubuntu:16.04 AS builder +\end{lstlisting} + +This instruction uses Ubuntu 16.04 as the base image for building the Rust application. The alias \texttt{builder} is assigned to this stage so that its outputs can later be referenced in the runtime stage. + +\paragraph{Working Directory} + +\begin{lstlisting} +WORKDIR /app/rust_ml +\end{lstlisting} + +The \texttt{WORKDIR} instruction sets the default working directory inside the container to: + +\begin{lstlisting} +/app/rust_ml +\end{lstlisting} + +All subsequent commands in the builder stage are executed relative to this directory. 
+ +\paragraph{Installing Build Dependencies} + +The following command installs the required packages for compiling the Rust project: + +\begin{lstlisting} +RUN apt-get update && apt-get install -y \ + curl \ + build-essential \ + pkg-config \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* +\end{lstlisting} + +Each package serves a specific purpose: + +\begin{itemize} + \item \texttt{curl}: Used to download external files, including the Rust installation script. + \item \texttt{build-essential}: Provides common compilation tools such as \texttt{gcc}, \texttt{g++}, and \texttt{make}. + \item \texttt{pkg-config}: Helps discover system libraries during the build process. + \item \texttt{ca-certificates}: Ensures secure HTTPS communication when downloading dependencies. +\end{itemize} + +The final cleanup command: + +\begin{lstlisting} +rm -rf /var/lib/apt/lists/* +\end{lstlisting} + +removes cached package lists to reduce image size. + +\paragraph{Installing Rust} + +Rust is installed using the official Rust installer: + +\begin{lstlisting} +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y +\end{lstlisting} + +This command downloads and executes the \texttt{rustup} installer. + +The flags used have the following meanings: + +\begin{itemize} + \item \texttt{--proto '=https'}: Restricts downloads to HTTPS only. + \item \texttt{--tlsv1.2}: Forces the use of TLS version 1.2 for secure transport. + \item \texttt{-sSf}: Makes \texttt{curl} silent while still showing errors if the download fails. + \item \texttt{-y}: Automatically accepts all installation prompts. +\end{itemize} + +After Rust is installed, the PATH environment variable is updated: + +\begin{lstlisting} +ENV PATH="/root/.cargo/bin:${PATH}" +\end{lstlisting} + +This ensures that Rust tools such as \texttt{cargo} and \texttt{rustc} are available in subsequent commands. + +\paragraph{Copying Source Code} + +\begin{lstlisting} +COPY . . 
+\end{lstlisting} + +This instruction copies the entire project directory from the host system into the current working directory inside the container. + +\paragraph{Building the Application} + +\begin{lstlisting} +RUN cargo build --release -p mnist_infer +\end{lstlisting} + +This command compiles the Rust project in release mode. + +The options used are: + +\begin{itemize} + \item \texttt{--release}: Builds the application with compiler optimizations enabled. + \item \texttt{-p mnist\_infer}: Specifies that only the \texttt{mnist\_infer} package should be compiled. +\end{itemize} + +The generated executable is stored in: + +\begin{lstlisting} +/app/rust_ml/target/release/mnist_infer +\end{lstlisting} + +\subsubsection{Runtime Stage} + +The second stage begins with: + +\begin{lstlisting} +FROM nvidia/vulkan:1.3-470 +\end{lstlisting} + +This stage uses an NVIDIA Vulkan runtime image as the base image. The purpose of using this image is to provide Vulkan-related runtime libraries and GPU compatibility for applications that may rely on Vulkan acceleration. + +Compared to the builder image, this runtime image is significantly smaller because it does not contain compilation tools, Rust compilers, or source code. + + +\paragraph{Runtime Working Directory} + +\begin{lstlisting} +WORKDIR /app +\end{lstlisting} + +This sets the runtime working directory to: + +\begin{lstlisting} +/app +\end{lstlisting} + +All runtime files are placed relative to this location. + +\paragraph{Copying the Compiled Binary} + +\begin{lstlisting} +COPY --from=builder /app/rust_ml/target/release/mnist_infer /app/binary +\end{lstlisting} + +This instruction copies the compiled executable from the builder stage into the runtime image. + +The \texttt{--from=builder} option tells Docker to retrieve the file from the stage named \texttt{builder}. 
+ +The binary is renamed from: + +\begin{lstlisting} +mnist_infer +\end{lstlisting} + +to: + +\begin{lstlisting} +/app/binary +\end{lstlisting} + +inside the runtime container. + +\paragraph{Copying the Model File} + +\begin{lstlisting} +COPY ./model/mnist_rust/model.mpk /app/model/mnist_rust/model.mpk +\end{lstlisting} + +This instruction copies the trained model file into the runtime container. + +The model file is stored at: + +\begin{lstlisting} +/app/model/mnist_rust/model.mpk +\end{lstlisting} + +The application can later load this file during inference. + +\paragraph{Environment Variables} + +Two environment variables are defined: + +\begin{lstlisting} +ENV RUST_LOG=info +ENV MODEL_PATH=/app/model/mnist_rust/model.mpk +\end{lstlisting} + +Their purposes are: + +\begin{itemize} + \item \texttt{RUST\_LOG=info}: Enables logging at the info level. + \item \texttt{MODEL\_PATH}: Stores the path to the trained model file. +\end{itemize} + +Using environment variables makes the application more flexible because configuration values can be changed without modifying the source code. + +\paragraph{Exposing the Application Port} + +\begin{lstlisting} +EXPOSE 9050 +\end{lstlisting} + +This instruction documents that the containerized application listens on port 9050. + +Although \texttt{EXPOSE} does not automatically publish the port to the host system, it informs users and orchestration tools such as Docker Compose or Kubernetes which port should be mapped. + +\paragraph{Container Startup Command} + +\begin{lstlisting} +CMD ["./binary"] +\end{lstlisting} + +This instruction defines the default command executed when the container starts. + +The compiled Rust binary is launched directly from the runtime working directory. + +\subsubsection{Advantages of the Dockerfile Design} + +This Dockerfile provides several important advantages: + +\begin{itemize} + \item Reduced final image size through multi-stage builds. 
+ \item Separation of build dependencies and runtime dependencies. + \item Improved security because the runtime image does not contain compilers or source code. + \item Faster deployment due to a lightweight runtime container. + \item Better portability because the same container can run consistently across different environments. + \item Easier maintenance through the use of environment variables and explicit working directories. +\end{itemize} + +Overall, this Dockerfile is designed to efficiently package the Rust-based MNIST inference application for deployment while minimizing runtime overhead and maintaining reproducibility. + +\subsection{Python (PyTorch) Dockerfile} + +This section details the image optimization strategy implemented for the MNIST inference container. The core approach minimizes the Docker image size by decoupling the heavy machine learning dependencies (PyTorch, etc.) from the application container. Instead of baking these libraries into the image, they are stored on an external volume (NFS share) and mounted at runtime. + +\subsubsection{Dockerfile Analysis} + +The \texttt{Dockerfile} is kept intentionally lightweight. By excluding large dependencies like \texttt{torch} from the \texttt{pip install} command, the image size remains very small (only containing the base Python runtime and lightweight web frameworks). 
+ +\vspace{0.5em} +\noindent\textbf{Listing 1: Optimized Inference Dockerfile} +\label{lst:dockerfile_inference} +\vspace{0.3em} + +\begin{lstlisting}[language=Dockerfile] +FROM python:3.12-slim + +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +ENV OMP_NUM_THREADS=1 +ENV MKL_NUM_THREADS=1 + +# Critical: Point Python to the external volume +ENV PYTHONPATH=/external-libs/ml_env/lib/python3.12/site-packages + +WORKDIR /app + +# Only install lightweight app dependencies +RUN pip install --no-cache-dir --upgrade pip && \ + pip install fastapi==0.110.0 uvicorn==0.29.0 python-multipart==0.0.9 + +COPY app.py model.py model.pt ./ + +EXPOSE 8000 + +CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"] +\end{lstlisting} + +\begin{itemize} + \item \textbf{Base Image:} Uses \texttt{python:3.12-slim} to minimise the OS footprint. + \item \textbf{Environment Configuration:} + \begin{itemize} + \item \texttt{PYTHONDONTWRITEBYTECODE=1}: Prevents Python from writing \texttt{.pyc} files to disk. + + \item \textbf{\texttt{PYTHONPATH}}: Crucially set to + \path{/external-libs/ml_env/lib/python3.12/site-packages}. + This instructs the Python interpreter to look for libraries in the mounted volume directory, not just the default system paths. + \end{itemize} + \item \textbf{Minimal Dependencies:} The \texttt{pip install} command only installs \texttt{fastapi}, \texttt{uvicorn}, and \texttt{python-multipart}. Heavy ML libraries are assumed to be present in the mounted volume. +\end{itemize} + +\subsubsection{Volume Mounting Strategy} + +The strategy relies on two shell scripts to set up the environment on the host machine and run the container with the correct volume mappings. + +\paragraph{Library Setup (\texttt{mount\_libs.sh})} +This script runs on the host machine (or a VM node) to prepare the shared library volume. +\begin{enumerate} + \item \textbf{NFS Client Installation:} It installs \texttt{nfs-common} to enable Network File System capabilities. 
+ \item \textbf{Mounting:} It connects to a remote NFS server (\texttt{172.16.203.14}) where the pre-installed ML libraries reside. + \item \textbf{Local Path:} The remote libraries are mounted to \texttt{/mnt/ml-libs} on the host. This directory acts as the bridge between the NFS server and the Docker container. +\end{enumerate} + +\paragraph{Runtime Execution (\texttt{run\_container.sh})} +This script launches the Docker container with the necessary runtime configurations to access the external libraries. + +\vspace{0.5em} +\noindent\textbf{Listing 2: Container Execution Command} +\label{lst:docker_run} +\vspace{0.3em} + +\begin{lstlisting}[language=Bash] +docker run -d \ + -v /mnt/ml-libs:/external-libs \ + -e PYTHONPATH=/external-libs/ml_env/lib/python3.12/site-packages \ + -p 8000:8000 \ + fastapi-ml-app +\end{lstlisting} + +\begin{itemize} + \item \textbf{\texttt{-v /mnt/ml-libs:/external-libs}}: This bind mount maps the host's \texttt{/mnt/ml-libs} (which contains the NFS data) to \texttt{/external-libs} inside the container. + \item \textbf{\texttt{-e PYTHONPATH=...}}: explicit environment variable override ensures the container's Python runtime finds the packages in \texttt{/external-libs}. +\end{itemize} + +\subsubsection{Benefits and Optimization} + +\begin{table*}[t!] +\centering +\caption{Optimization Benefits} +\label{tab:docker_optimization} +\begin{tabular}{|l|p{6cm}|p{6cm}|} +\hline +\textbf{Feature} & \textbf{Standard Approach} & \textbf{Volume Mount Approach} \\ \hline +\textbf{Image Size} & \textbf{Huge} ($>2GB$). Includes PyTorch, CUDA binaries, and all dependencies. & \textbf{Tiny} (~100MB). Only contains app code and minimal HTTP libs. \\ \hline +\textbf{Build Time} & \textbf{Slow}. Downloading and installing PyTorch takes minutes. & \textbf{Fast}. setup only installs \texttt{fastapi}. \\ \hline +\textbf{Updates} & requires rebuilding and pushing large layers for every code change. & Code changes only require rebuilding the tiny app layer. 
Library updates are handled externally. \\ \hline +\end{tabular} +\end{table*} + +This architecture allows for rapid deployment and updating of the application logic without the overhead of moving gigabytes of container layers for unchanged machine learning dependencies. + +\subsection{Rust: CNN Model Architecture and Training Performance} + +The convolutional neural network (CNN) model used for the experiment consisted of two convolutional layers followed by adaptive average pooling, dropout, and two fully connected layers. The complete architecture is shown below: + +\begin{lstlisting} +Model { + conv1: Conv2d {ch_in: 1, ch_out: 8, stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Valid, params: 80} + conv2: Conv2d {ch_in: 8, ch_out: 16, stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Valid, params: 1168} + pool: AdaptiveAvgPool2d {output_size: [8, 8]} + dropout: Dropout {prob: 0.5} + linear1: Linear {d_input: 1024, d_output: 512, bias: true, params: 524800} + linear2: Linear {d_input: 512, d_output: 10, bias: true, params: 5130} + activation: Relu + params: 531178 +} +\end{lstlisting} + +The model was trained for 10 epochs. Over the course of training, both the training and validation performance improved consistently. Training accuracy increased from 81.575\% in the first epoch to 97.300\% in the final epoch, while validation accuracy improved from 92.133\% to 98.517\%. + +Similarly, the training loss decreased significantly from 0.656 to 0.087, and the validation loss reduced from 0.258 to 0.054 by the end of training. The macro F1-score also improved substantially, reaching 96.974\% for training and 98.321\% for validation. + +\begin{table*}[t!] 
+\centering +\caption{Training and Validation Metrics Summary} +\begin{tabular}{|l|l|c|c|c|c|} +\hline +\textbf{Split} & \textbf{Metric} & \textbf{Min.} & \textbf{Epoch} & \textbf{Max.} & \textbf{Epoch} \\ +\hline +Train & Accuracy & 81.575 & 1 & 97.300 & 10 \\ +Train & Loss & 0.087 & 10 & 0.656 & 1 \\ +Train & Precision@Top1 [Macro] & 82.126 & 1 & 97.304 & 10 \\ +Train & Recall@Top1 [Macro] & 81.286 & 1 & 97.232 & 10 \\ +Train & F1-Score@Top1 [Macro] & 79.715 & 1 & 96.974 & 10 \\ +Train & Top-5 Accuracy & 97.696 & 1 & 99.969 & 10 \\ +Train & CPU Memory (GB) & 2.514 & 2 & 2.927 & 10 \\ +Train & CPU Usage (\%) & 20.753 & 5 & 30.394 & 10 \\ +\hline +Valid & Accuracy & 92.133 & 1 & 98.517 & 10 \\ +Valid & Loss & 0.054 & 10 & 0.258 & 1 \\ +Valid & Precision@Top1 [Macro] & 92.154 & 1 & 98.527 & 10 \\ +Valid & Recall@Top1 [Macro] & 91.978 & 1 & 98.425 & 10 \\ +Valid & F1-Score@Top1 [Macro] & 91.176 & 1 & 98.321 & 10 \\ +Valid & Top-5 Accuracy & 99.583 & 1 & 99.967 & 10 \\ +Valid & CPU Memory (GB) & 2.514 & 2 & 3.085 & 10 \\ +Valid & CPU Usage (\%) & 20.539 & 2 & 39.652 & 10 \\ +\hline +\end{tabular} +\label{tab:cnn_metrics_summary} +\end{table*} + +The results indicate that the model achieved strong generalization performance with minimal overfitting, as the validation accuracy remained slightly higher than the training accuracy throughout the experiment. The consistently high Top-5 accuracy values further demonstrate that the model was able to correctly identify the correct class within its top predictions. + +It should also be noted that the execution terminated with a segmentation fault after training completion. However, since the fault occurred after all epochs had been completed and metrics had already been recorded, it did not affect the validity of the training results. \\ + +The time taken to train the model is 229.916s (3min 49.916s). 
+ + + +\subsection{Python: CNN Model Architecture and Training Performance} + +The convolutional neural network (CNN) model implemented in Python mirrors the Rust architecture, consisting of two convolutional layers followed by adaptive average pooling, dropout regularization, and two fully connected layers. + +\textbf{Model Architecture:} + +\begin{lstlisting} +Model { + conv1: Conv2d {ch_in: 1, ch_out: 8, kernel_size: [3, 3], stride: [1, 1]} + conv2: Conv2d {ch_in: 8, ch_out: 16, kernel_size: [3, 3], stride: [1, 1]} + pool: AdaptiveAvgPool2d {output_size: [8, 8]} + dropout: Dropout {prob: 0.5} + linear1: Linear {d_input: 1024, d_output: 512, params: 524800} + linear2: Linear {d_input: 512, d_output: 10, params: 5130} + activation: ReLU + total params: 531178 +} +\end{lstlisting} + +The model was trained for 10 epochs. Training and validation performance improved consistently over time. + +Training accuracy increased from 82.01\% to 97.29\%, while validation accuracy improved from 92.87\% to 98.20\%. +Training loss decreased significantly from 0.5947 to 0.0867, and validation loss reduced from 0.2475 to 0.0579. + +The macro F1-score reached 0.9814, demonstrating strong classification performance. Additionally, the Top-5 accuracy achieved 99.98\%, indicating highly reliable predictions. + +\begin{table*}[t!] 
+\centering +\begin{tabular}{|l|l|c|c|c|c|} +\hline +\textbf{Split} & \textbf{Metric} & \textbf{Min} & \textbf{Epoch} & \textbf{Max} & \textbf{Epoch} \\ +\hline + +Train & Accuracy & 82.01 & 1 & 97.29 & 10 \\ +Train & Loss & 0.0867 & 10 & 0.5947 & 1 \\ +Train & Precision@Top1 [Macro] & -- & -- & -- & -- \\ +Train & Recall@Top1 [Macro] & -- & -- & -- & -- \\ +Train & F1-Score@Top1 [Macro] & -- & -- & -- & -- \\ +Train & Top-5 Accuracy & -- & -- & -- & -- \\ +Train & CPU Memory (GB) & 0.84 & -- & 0.98 & -- \\ +Train & CPU Usage (\%) & 72.1 & -- & 92.0 & -- \\ +\hline + +Valid & Accuracy & 92.87 & 1 & 98.20 & 10 \\ +Valid & Loss & 0.0579 & 10 & 0.2475 & 1 \\ +Valid & Precision@Top1 [Macro] & 0.9816 & -- & 0.9816 & -- \\ +Valid & Recall@Top1 [Macro] & 0.9812 & -- & 0.9812 & -- \\ +Valid & F1-Score@Top1 [Macro] & 0.9814 & -- & 0.9814 & -- \\ +Valid & Top-5 Accuracy & 99.98 & -- & 99.98 & -- \\ +Valid & CPU Memory (GB) & 0.84 & -- & 0.98 & -- \\ +Valid & CPU Usage (\%) & 72.1 & -- & 92.0 & -- \\ +\hline + +\end{tabular} +\caption{Python Training and Validation Metrics Summary} +\end{table*} + +The results indicate strong generalization performance with no signs of overfitting. Validation accuracy remained consistently high and closely followed training accuracy. + +Training was stable with zero NaN events observed. The total training time was 182.23 seconds, with an average epoch time of 15.05 seconds and an average iteration speed of 50.58 iterations per second. + + + +\newpage + +\subsection{Container Comparison: Python vs Rust} + +\begin{table*}[t!] 
+\centering +\begin{tabular}{|l|c|c|} +\hline +\textbf{Feature} & \textbf{Python (GPU)} & \textbf{Rust (WGPU)} \\ +\hline +Image Name & text\_classification\_image & text\_class\_rs \\ +\hline +Size & 3.98GB & 1.02GB \\ +\hline +Backend & PyTorch + CUDA & Native Rust (WGPU) \\ +\hline +Startup Time & Slower & Faster \\ +\hline +Dependencies & Heavy (Torch, CUDA, Python) & Minimal (compiled binary) \\ +\hline +Deployment Complexity & Higher & Lower \\ +\hline +Flexibility & High (research-friendly) & Moderate \\ +\hline +Runtime Stability & Medium & High \\ +\hline +\end{tabular} +\caption{Comparison of Python GPU-based and Rust-based inference containers} +\end{table*} + +\noindent +The Python-based container provides flexibility and rapid experimentation using the PyTorch ecosystem, but at the cost of larger image size and dependency complexity. In contrast, the Rust-based container offers a lightweight, production-ready solution with faster startup time and minimal runtime dependencies, making it more suitable for deployment scenarios. + +\subsection{Model Size Comparison} + +\begin{table*}[t!] +\centering +\begin{tabular}{|l|c|c|} +\hline +\textbf{Model} & \textbf{Rust (.mpk/.bin)} & \textbf{Python (.pt/.pth)} \\ +\hline +MNIST & 1.1 MB & 2 MB \\ +\hline +Text Classification (AG News) & 21 MB & 40.6 MB \\ +\hline +Regression & 4 KB & 6 MB \\ +\hline +LSTM & 60 KB & 120 KB \\ +\hline +\end{tabular} +\caption{Comparison of model sizes between Rust and Python implementations} +\end{table*} + +\noindent +Rust-based serialized models are consistently smaller than their Python counterparts. This reduction is most significant in simpler models such as regression, and remains substantial for larger models like text classification. The smaller footprint of Rust models makes them more suitable for lightweight deployment and resource-constrained environments. 
+
+\clearpage
+
+\section{Task: Regression}
+
+\subsection{Introduction}
+This document outlines the architectural, mathematical, and deployment specifics of implementing a Neural Network-based Regression model across two disparate machine learning environments: Rust (utilizing the Burn framework) and Python (utilizing PyTorch). It covers the distinct model architecture decisions, dataset handling strategies, and specialized pipeline deployment techniques leveraging Network File System (NFS) storage mapped into containers via Docker bind mounts.
+
+\subsection{Model Architecture and Mathematical Formulation}
+
+\subsubsection{Mathematical Foundation}
+The core mathematical foundation deployed across both frameworks is a classical Feed-Forward Neural Network consisting of a single hidden dimension mapping inputs directly onto a continuous single-variable regression output.
+
+For a given input feature vector $X \in \mathbb{R}^N$ (where $N$ dictates the feature size depending on the target dataset), the network's forward transformation can be represented sequentially as:
+\begin{align}
+    Z_1 &= X \cdot W_1^T + b_1 \quad &\text{(Input Projection)} \\
+    A_1 &= \max(0, Z_1) \quad &\text{(ReLU Activation)} \\
+    \hat{Y} &= A_1 \cdot W_2^T + b_2 \quad &\text{(Output Projection)}
+\end{align}
+
+Where:
+\begin{itemize}
+    \item $W_1 \in \mathbb{R}^{H \times N}$ and $b_1 \in \mathbb{R}^H$ map the inputs onto the hidden vector space $H$.
+    \item $\max(0, \cdot)$ denotes the Non-Linear Rectified Linear Unit (ReLU) mapping algorithm.
+    \item $W_2 \in \mathbb{R}^{1 \times H}$ and $b_2 \in \mathbb{R}$ collapse the hidden abstraction onto the finalized regression scalar prediction $\hat{Y}$.
+\end{itemize}
+
+\subsubsection{Architectural Configurations}
+While the mathematical foundations are identical, implementations slightly differ based on dataset selections within the modules:
+\begin{itemize}
+    \item \textbf{PyTorch Architecture:} Configures $N=13$ input features mapping to $H=64$ hidden parameters.
+ \item \textbf{Rust (Burn) Architecture:} Configures $N=8$ input features concurrently mapping to $H=64$ hidden parameters. +\end{itemize} +In both configurations, standard parameter biases (`bias=True`) are included and automatically initialized. + +\subsection{Training Pipelines} + +Both codebases train the model iteratively tracking gradients via the Adam optimizer scaled against Mean Squared Error (MSE) loss logic: +\[ \text{MSE} = \frac{1}{B} \sum_{i=1}^{B} (Y_i - \hat{Y}_i)^2 \] + +\subsubsection{PyTorch Context} +\begin{itemize} + \item \textbf{Data Loading:} Automatically pulls the \textbf{Boston Housing} dataset array (.npz file) from an external Google API via `urllib` and manually partitions it down into an explicit $80/20$ split. + \item \textbf{Telemetry Metrics:} Generates explicit hardware tracking loops inside the main epoch runner. Uses the `psutil` library to compute and stream epoch `iteration\_speed`, raw RAM consumption, and `cpu\_temp` hardware sensors parallel to the loss parameters. +\end{itemize} + +\subsubsection{Rust (Burn) Context} +\begin{itemize} + \item \textbf{Data Loading:} Links into Huggingface's dataset registry asynchronously targeting the \textbf{California Housing} SQLized splits mapping onto memory arrays via localized `HousingDistrictItem` structs. + \item \textbf{Normalization Mapping:} Computes spatial min-max normalizations programmatically over inputs during training: + \[ X_{norm} = \frac{X - \text{min}}{\text{max} - \text{min}} \] + This logic restricts features within standard boundaries precluding exploding gradient derivations. +\end{itemize} + +\subsection{Inference Pipeline and Docker NFS Integration} + +Deploying these isolated pipelines necessitates radically different execution strategies, highlighting Python's heavyweight runtime dependency bottlenecks versus Rust's compile-time optimizations. 
+ +\subsubsection{PyTorch Inference Architecture} +Standard PyTorch Docker environments routinely eclipse several gigabytes due to CUDA bindings and generic scientific computation loops. To circumvent this inside microservices, the PyTorch inference pipeline mandates a hybrid Network File System (NFS) mapping architecture: +\begin{enumerate} + \item \textbf{NFS Mounting (\texttt{mount\_libs.sh}):} Installs an external `nfs-common` client locally and binds the extensive python library volume from an external dedicated storage server (`172.16.203.14`) into the host machine's `/mnt/LSTM-libs` map. + \item \textbf{Lightweight Container Image:} The backend \texttt{Dockerfile} avoids `pip install` commands completely, simply initializing a barebone `nvidia/cuda:12.1.1` image mapping Python $3.11$ system links. + \item \textbf{Volume Inject (\texttt{run\_container.sh}):} The script initializes the container enforcing `-v` flags that sync the NFS `/mnt/LSTM-libs` directory seamlessly onto the Docker's `/external-libs`. Crucially, it overrides the system \texttt{PYTHONPATH} to target those external `site-packages` at runtime. + \item \textbf{Execution:} The `FastAPI` instance loads, bypasses massive disk pulls, links the models iteratively, and fields inbound `HousingFeatures` lists continuously. +\end{enumerate} + +\subsubsection{Rust (Burn) Inference Architecture} +Rust handles Docker microservices inherently via statically linked deployments: +\begin{itemize} + \item \textbf{Multi-Stage Compiling:} Executes a build phase operating within an oversized `rust:1.92-alpine` chain, ejecting the resulting binary onto an isolated stripped `alpine:3.23` environment structure. + \item \textbf{Native Routing:} Utilizes \texttt{Axum} servers to establish the HTTP logic endpoints securely routing JSON payloads mapping to specific feature names (e.g. \texttt{median\_income}, \texttt{house\_age}). 
+\end{itemize} + +\subsection{Rust: Regression Model Performance Analysis} + +The regression model used in this experiment was a simple feed-forward neural network consisting of one hidden layer followed by an output layer. The model was designed to predict the median house value based on eight input features. + +\subsubsection{Model Architecture} + +The architecture of the regression model is shown below: + +\begin{lstlisting} +RegressionModel { + input_layer: Linear {d_input: 8, d_output: 64, bias: true, params: 576} + output_layer: Linear {d_input: 64, d_output: 1, bias: true, params: 65} + activation: Relu + params: 641 +} +\end{lstlisting} + +The model contains: + +\begin{itemize} + \item An input layer that maps 8 input features to 64 hidden units + \item A ReLU activation function applied after the hidden layer + \item An output layer that maps the 64 hidden units to a single scalar value +\end{itemize} + +The total number of trainable parameters in the model was only 641, making it a lightweight model suitable for fast training and inference. + +\subsubsection{Training Configuration} + +The model was trained for 100 epochs. A constant learning rate of: + +\[ +1.0 \times 10^{-3} +\] + +was used throughout the entire training process. + +\subsubsection{Training Performance} + +The training loss decreased substantially over the 100 epochs. Initially, the model started with a training loss of 3.086 during the first epoch. By the final epoch, the loss had reduced to 0.414. + +This significant reduction in loss indicates that the model successfully learned the underlying relationship between the input features and the target variable. + +\subsubsection{Validation Performance} + +Validation loss also showed a considerable improvement during training. The validation loss decreased from 4.132 in the first epoch to a minimum of 0.635 at epoch 51. 
+ +The difference between the final training loss and the minimum validation loss suggests that the model achieved good generalization performance without severe overfitting. + +\begin{table*}[t!] +\centering +\caption{Training and Validation Metrics Summary for the Regression Model} +\begin{tabular}{|l|l|c|c|c|c|} +\hline +\textbf{Split} & \textbf{Metric} & \textbf{Min.} & \textbf{Epoch} & \textbf{Max.} & \textbf{Epoch} \\ +\hline +Train & Loss & 0.414 & 100 & 3.086 & 1 \\ +Train & Learning Rate & $1.0 \times 10^{-3}$ & 1 & $1.0 \times 10^{-3}$ & 100 \\ +Train & CPU Memory (GB) & 2.125 & 4 & 2.325 & 56 \\ +Train & CPU Usage (\%) & 19.539 & 54 & 37.989 & 11 \\ +\hline +Valid & Loss & 0.635 & 51 & 4.132 & 1 \\ +Valid & CPU Memory (GB) & 2.124 & 3 & 2.325 & 55 \\ +Valid & CPU Usage (\%) & 19.550 & 54 & 37.960 & 11 \\ +\hline +\end{tabular} +\label{tab:regression_metrics_summary} +\end{table*} + +\subsubsection{Prediction Example} + +A sample prediction generated by the model is shown below: + +\begin{lstlisting} +Predicted 2.021734 Expected 2.158 +\end{lstlisting} + +The predicted value is reasonably close to the expected value, indicating that the model was able to approximate the target variable with acceptable accuracy. + +Since the median house value was measured in units of 100,000 dollars, the prediction corresponds to: + +\begin{itemize} + \item Predicted value: approximately 202,173 dollars + \item Expected value: approximately 215,800 dollars +\end{itemize} + +\subsubsection{Predicted vs. Expected Distribution} + +The predicted-versus-expected plot suggests that the model captures the general trend in the target values, although some prediction errors remain for certain samples. + +Most of the predicted values appear concentrated around the central region of the distribution, indicating that the model performs better on common house value ranges than on extreme values. 
+ +\subsubsection{Resource Utilization} + +The model required relatively little memory during execution. Training memory usage ranged from 2.125 GB to 2.325 GB, while validation memory usage ranged from 2.124 GB to 2.325 GB. + +CPU utilization remained moderate throughout training. Training CPU usage ranged from 19.539\% to 37.989\%, while validation CPU usage ranged from 19.550\% to 37.960\%. + +CPU temperature values were unavailable and therefore recorded as NaN. + +\subsubsection{Execution Time and Failure} + +The complete training and evaluation process required: + +\begin{itemize} + \item Real time: 3 minutes and 18.257 seconds + \item User CPU time: 4 minutes and 13.554 seconds + \item System CPU time: 50.340 seconds +\end{itemize} + +\subsection{Language Specific Implementation Details} + +\subsubsection{PyTorch-Specific Paradigms} +\begin{itemize} + \item \textbf{Thread Clamping:} Due to inference optimization restrictions (especially running CPU variations alongside container structures), the `app.py` enforces explicit core binding calls via `torch.set\_num\_threads(1)` and `torch.set\_num\_interop\_threads(1)` securing computational resources and restricting OS context-switching overheads. + \item \textbf{Matrix Array Verifications:} Manually inspects raw matrix vector mappings validating dimensions dynamically against numeric constraints: \texttt{len(x) != NUM\_FEATURES} triggering runtime panics before pipeline evaluations fail. + \item \textbf{Manual Hardware Moving:} The framework is heavily littered with required `.to(device)` mapping configurations switching inputs, datasets, targets, and models manually between the host and external components. 
+\end{itemize}
+
+\subsubsection{Rust (Burn)-Specific Paradigms}
+\begin{itemize}
+    \item \textbf{Generic Compile-Time Shapes:} Dimension mappings and tensor validations are fundamentally enforced inside the Rust compiler boundaries via generically typed \texttt{Tensor} arrays indicating batches of distinct input structures mapping to \texttt{targets: Tensor}. Invalid sizes fail compilation, voiding the requirement for manual PyTorch matrix validations.
+    \item \textbf{Struct Batching Protocols:} Inference doesn't evaluate primitive float arrays. Instead, the API relies on executing an overarching `HousingBatcher` which transforms specific struct domains (\texttt{HousingDistrictItem}) safely into tensor primitives while executing implicit `self.normalizer.to\_device(device)` logic silently against constants behind boundaries.
+    \item \textbf{Record Deserialization:} States are strictly detached from models via standard `.mpk` maps. They invoke explicit \texttt{NoStdTrainingRecorder::new().load()} tracking traits unbinding memory limits inherent to standard dict serialization configurations natively loaded via `RegressionModelConfig`.
+\end{itemize}
+
+\newpage
+\subsection{Python: Regression Model Architecture and Training Performance}
+
+The regression model used in this experiment is a lightweight fully connected neural network with a small number of parameters (961 total). The model is optimized using mean squared error loss.
+
+
+
+The model was trained for 100 epochs. Training loss decreased significantly from 8265.55 to 69.86 (99.15\% reduction), while validation loss decreased from 9045.34 to 55.70.
+
+Despite strong loss reduction, the model struggled to achieve good generalization. The validation $R^2$ score remained negative (-1.07), indicating that the model performs worse than a simple baseline predictor.
+
+\begin{table*}[t!]
+\centering +\begin{tabular}{|l|l|c|c|c|c|} +\hline +\textbf{Split} & \textbf{Metric} & \textbf{Min} & \textbf{Epoch} & \textbf{Max} & \textbf{Epoch} \\ +\hline + +Train & Loss & 68.50 & 100 & 8265.55 & 1 \\ +Train & RMSE & 8.39 & 100 & 91.53 & 1 \\ +Train & MAE & 6.29 & 100 & 90.33 & 1 \\ +Train & R$^2$ & -96.93 & 1 & 0.1767 & 100 \\ +Train & Grad Norm (Total) & 118.80 & -- & 112876.03 & 1 \\ +Train & Iteration Speed (it/s) & 0.69 & 1 & 14.39 & 54 \\ +Train & CPU Memory (GB) & 0.87 & -- & 1.02 & -- \\ +Train & CPU Usage (\%) & 45.7 & -- & 97.6 & -- \\ +\hline + +Valid & Loss & 50.93 & 65 & 9045.34 & 1 \\ +Valid & RMSE & 7.14 & 65 & 95.11 & 1 \\ +Valid & MAE & 6.00 & 65 & 93.52 & 1 \\ +Valid & R$^2$ & -335.40 & 1 & -0.8940 & 65 \\ +Valid & Iteration Speed (it/s) & 0.69 & 1 & 14.39 & 54 \\ +Valid & CPU Memory (GB) & 0.87 & -- & 1.02 & -- \\ +Valid & CPU Usage (\%) & 45.7 & -- & 97.6 & -- \\ +\hline + +\end{tabular} +\caption{Regression Model Training and Validation Metrics Summary} +\end{table*} + +\textbf{Training Efficiency and Stability:} +\begin{itemize} + \item Total Training Time: 47.39 seconds + \item Average Epoch Time: 0.225 seconds + \item Iteration Speed (Mean): 10.36 it/s + \item Gradient Norm (Mean): 7981.31 + \item NaN Events: 0 + \item Convergence: Non-monotonic loss decrease + \item Overfitting Detected: No +\end{itemize} + +The results indicate that while optimization was successful in reducing loss, the model lacks sufficient capacity or feature representation to generalize well. The persistently negative validation $R^2$ suggests underfitting or a mismatch between model complexity and data characteristics. + + + + +\newpage + +\subsection{Container Comparison: Python vs Rust} + +\begin{table*}[t!] 
+\centering +\begin{tabular}{|l|c|c|} +\hline +\textbf{Feature} & \textbf{Python (GPU)} & \textbf{Rust (WGPU)} \\ +\hline +Image Name & text\_classification\_image & text\_class\_rs \\ +\hline +Size & 3.98GB & 973MB \\ +\hline +Backend & PyTorch + CUDA & Native Rust (WGPU) \\ +\hline +Startup Time & Slower & Faster \\ +\hline +Dependencies & Heavy (Torch, CUDA, Python) & Minimal (compiled binary) \\ +\hline +Deployment Complexity & Higher & Lower \\ +\hline +Flexibility & High (research-friendly) & Moderate \\ +\hline +Runtime Stability & Medium & High \\ +\hline +\end{tabular} +\caption{Comparison of Python GPU-based and Rust-based inference containers} +\end{table*} + +\noindent +The Python-based container provides flexibility and rapid experimentation using the PyTorch ecosystem, but at the cost of larger image size and dependency complexity. In contrast, the Rust-based container offers a lightweight, production-ready solution with faster startup time and minimal runtime dependencies, making it more suitable for deployment scenarios. + + +\subsection{Model Size Comparison} + +\begin{table*}[t!] +\centering +\begin{tabular}{|l|c|c|} +\hline +\textbf{Model} & \textbf{Rust (.mpk/.bin)} & \textbf{Python (.pt/.pth)} \\ +\hline +MNIST & 1.1 MB & 2 MB \\ +\hline +Text Classification (AG News) & 21 MB & 40.6 MB \\ +\hline +Regression & 4 KB & 6 MB \\ +\hline +LSTM & 60 KB & 120 KB \\ +\hline +\end{tabular} +\caption{Comparison of model sizes between Rust and Python implementations} +\end{table*} + +\noindent +Rust-based serialized models are consistently smaller than their Python counterparts. This reduction is most significant in simpler models such as regression, and remains substantial for larger models like text classification. The smaller footprint of Rust models makes them more suitable for lightweight deployment and resource-constrained environments. 
+ +\clearpage + +\section{Task: Text Classification (AG News)} +\subsection{Model Architecture and Training Strategy} + +The text classification system is built using the \texttt{Burn} framework in Rust, leveraging a Transformer-based architecture for feature extraction and a linear classification head. This section details the mathematical formulation of the model and the strategy employed for training. + +\subsubsection{Model Architecture} +The core of the model is a Transformer Encoder, which processes a sequence of token embeddings to capture contextual relationships. The architecture consists of three primary stages: embedding, encoding, and classification. + +\paragraph{Embedding Layer} +Input text is tokenized and converted into a sequence of indices $X \in \mathbb{N}^{B \times L}$, where $B$ is the batch size and $L$ is the sequence length. The model utilizes two parallel embedding layers: +\begin{enumerate} + \item \textbf{Token Embedding ($E_{tok}$)}: Maps token indices to dense vectors of dimension $d_{model}$. + \item \textbf{Positional Embedding ($E_{pos}$)}: Maps position indices $[0, \dots, L-1]$ to dense vectors of dimension $d_{model}$ to inject sequence order information. +\end{enumerate} + +The final embedding representation $E$ is obtained by averaging the token and positional embeddings: +\begin{equation} + E = \frac{E_{tok}(X) + E_{pos}(\text{positions})}{2} +\end{equation} + +\paragraph{Transformer Encoder} +The embedding tensor $E$ is passed through a multi-layer Transformer Encoder. Each layer consists of a Multi-Head Self-Attention (MHSA) mechanism followed by a Position-wise Feed-Forward Network (FFN), with residual connections and layer normalization. 
+ +The configuration used in this implementation is as follows: +\begin{itemize} + \item \textbf{Model Dimension ($d_{model}$)}: 256 + \item \textbf{Feed-Forward Dimension ($d_{ff}$)}: 1024 + \item \textbf{Number of Heads ($N_{heads}$)}: 8 + \item \textbf{Number of Layers ($N_{layers}$)}: 4 + \item \textbf{Normalization}: Layer norm applied before sub-layers (Pre-Norm). +\end{itemize} + +Let $H = \text{TransformerEncoder}(E)$, where $H \in \mathbb{R}^{B \times L \times d_{model}}$ represents the contextualized representations of the input sequence. + +\paragraph{Classification Head} +For classification, the model utilizes the representation of the first token (typically acting as the [CLS] token) from the encoded sequence. This vector is passed through a linear layer to project it into the class space: +\begin{equation} + Y = \text{Linear}(H_{[:, 0, :]}) +\end{equation} +where $Y \in \mathbb{R}^{B \times N_{classes}}$ represents the logits. For inference, a Softmax function is applied to obtain probabilities: +\begin{equation} + \hat{P} = \text{Softmax}(Y) +\end{equation} + +\begin{table*}[t!] 
+\centering
+\caption{Model Architecture Summary}
+\label{tab:model_arch}
+\begin{tabular}{|l|l|c|c|}
+\hline
+\textbf{Component} & \textbf{Configuration / Details} & \textbf{Input Shape} & \textbf{Output Shape} \\ \hline
+Token Embedding & $V \to d_{model}$ ($V$: Vocab Size) & $(B, L)$ & $(B, L, 256)$ \\ \hline
+Pos Embedding & $L_{max} \to d_{model}$ & $(B, L)$ & $(B, L, 256)$ \\ \hline
+Embedding Merge & Average ($E_{tok} + E_{pos}$) & - & $(B, L, 256)$ \\ \hline
+Transformer Block & 4 Layers, 8 Heads, $d_{ff}=1024$ & $(B, L, 256)$ & $(B, L, 256)$ \\ \hline
+Feature Extract & Slice First Token (Index 0) & $(B, L, 256)$ & $(B, 256)$ \\ \hline
+Classifier Head & Linear ($256 \to N_{classes}$) & $(B, 256)$ & $(B, N_{classes})$ \\ \hline
+\end{tabular}
+\end{table*}
+
+
+
+\subsection{Rust Backend: Load Testing with Locust}
+
+The performance of the Rust-based backend was evaluated using the Locust load testing framework. The objective was to analyze system behavior under concurrent user load and measure key performance characteristics such as throughput and latency.
+
+\textbf{Testing Setup:}
+\begin{itemize}
+    \item Tool: Locust
+    \item Backend: Rust (HTTP service)
+    \item Test Type: Concurrent user load simulation
+    \item Environment: Linux system
+\end{itemize}
+
+\textbf{Dashboard Visualization:}
+
+\textbf{Full Report:}
+
+The complete load testing dashboard has been exported as a PDF and is included below for detailed inspection.
+
+\includepdf[pages=-]{RustLocust/text.pdf}
+
+
+\subsection{Python Backend: Load Testing with Locust}
+
+The performance of the Python-based backend was evaluated using the Locust load testing framework. The goal was to assess system behavior under concurrent user load and analyze key performance characteristics such as throughput and response latency.
+
+\textbf{Testing Setup:}
+\begin{itemize}
+    \item Tool: Locust
+    \item Backend: Python (HTTP service)
+    \item Test Type: Concurrent user load simulation
+    \item Environment: Linux system
+\end{itemize}
+
+\textbf{Dashboard Visualization:}
+
+\textbf{Full Report:}
+
+The complete load testing dashboard has been exported as a PDF and is included below for detailed inspection.
+
+\includepdf[pages=-]{PythonLocust/text.pdf}
+
+
+\subsubsection{Training Strategy}
+The model is trained using a supervised learning approach with the following configuration:
+
+\paragraph{Loss Function}
+The training objective is to minimize the Cross-Entropy Loss between the predicted logits $Y$ and the ground truth class labels $C$:
+\begin{equation}
+    \mathcal{L} = \text{CrossEntropy}(Y, C) = -\sum_{c=1}^{N_{classes}} \mathbb{1}_{c=C} \log\left(\frac{e^{Y_c}}{\sum_{j} e^{Y_j}}\right)
+\end{equation}
+
+\paragraph{Optimization}
+We employ the \textbf{Adam} optimizer with the following parameters:
+\begin{itemize}
+    \item \textbf{Weight Decay}: $5 \times 10^{-5}$
+    \item \textbf{Beta Coefficients}: Standard defaults (typically $\beta_1=0.9, \beta_2=0.999$)
+\end{itemize}
+
+\paragraph{Learning Rate Scheduling}
+A \textbf{Noam Learning Rate Scheduler} is used to stabilize training. The learning rate increases linearly during a warmup phase and then decays proportionally to the inverse square root of the step number.
+\begin{equation}
+\begin{aligned}
+LR &= d_{model}^{-0.5} \cdot \min( \\
+    &\quad step\_num^{-0.5}, \\
+    &\quad step\_num \cdot warmup\_steps^{-1.5}
+)
+\end{aligned}
+\end{equation}
+\begin{itemize}
+    \item \textbf{Warmup Steps}: 1000
+    \item \textbf{Base Learning Rate}: 0.01
+\end{itemize}
+
+\paragraph{Metrics}
+During training and validation, the following metrics are tracked to monitor performance:
+\begin{itemize}
+    \item \textbf{Loss}: Cross-Entropy Loss.
+    \item \textbf{Accuracy}: Percentage of correct predictions.
+ \item \textbf{F1-Score, Precision, Recall}: Macro-averaged metrics to account for class balance. +\end{itemize} + +\subsection{Burn Code Specifications} + +This section outlines the significant implementation details of the text classification system, focusing on the architectural choices in \texttt{model.rs} and the robust training pipeline defined in \texttt{training.rs}. +\subsubsection{Model Implementation (\texttt{model.rs})} +The \texttt{TextClassificationModel} leverages the \textbf{Burn} framework's modular design to implement a Transformer-based classifier. Key features of this implementation include: +\begin{itemize} + \item \textbf{Dual Embedding Strategy:} The model employs two distinct embedding layers: \texttt{embedding\_token} for semantic content and \texttt{embedding\_pos} for positional information. A unique characteristic of this implementation is the fusion strategy, where these embeddings are combined via averaging: + \[ + E_{final} = \frac{E_{pos} + E_{token}}{2} + \] + This differs from the standard summation approach often found in BERT implementations, potentially stabilizing the initial magnitude of the embedding vectors. + + \item \textbf{Configurable Architecture:} The system uses a \texttt{TextClassificationModelConfig} struct derived with the \texttt{Config} macro. This allows for type-safe and serializable hyperparameter management, ensuring the model architecture (hidden size, vocabulary size, sequence length) can be easily saved, loaded, and reproducible. + + \item \textbf{Masked Attention:} The forward pass actively utilizes padding masks (\texttt{mask\_pad}). These masks are passed into the \texttt{TransformerEncoderInput}, ensuring that the self-attention mechanism strictly ignores padding tokens, which is critical for handling variable-length text sequences correctly. + + \item \textbf{Separation of Train and Inference Logic:} The model explicitly implements the \texttt{TrainStep} and \texttt{InferenceStep} traits. 
+    \begin{itemize}
+        \item \textbf{Training:} Returns a \texttt{ClassificationOutput} struct containing the calculated Cross-Entropy loss for backpropagation.
+        \item \textbf{Inference:} Returns raw probabilities by applying a softmax activation on the output logits, facilitating direct class prediction.
+    \end{itemize}
+\end{itemize}
+\subsubsection{Training Pipeline (\texttt{training.rs})}
+The training module is designed for reliability and comprehensive observability. It integrates advanced optimization techniques and hardware-aware monitoring.
+\begin{itemize}
+    \item \textbf{Noam Scheduler:} Transformer models are notoriously sensitive to learning rates. The code implements the \textbf{Noam Learning Rate Scheduler} (popularized by ``Attention Is All You Need''), which features a linear warmup phase (1000 steps) followed by an inverse square root decay based on the model dimension ($d_{model}$). This prevents gradient explosions during early training stages.
+
+    \item \textbf{Distributed Training Support:} The implementation explicitly handles distributed computing scenarios. It utilizes Rust's feature flags (\texttt{\#[cfg(feature = "ddp")]}) to switch between single-device training and \textbf{Distributed Data Parallel (DDP)} strategies. When enabled, it employs a tree-based \texttt{AllReduceStrategy} for synchronizing gradients across multiple GPUs or nodes.
+
+    \item \textbf{Comprehensive Telemetry:} The training loop is instrumented with an extensive suite of metrics beyond simple accuracy. It tracks:
+    \begin{itemize}
+        \item \textbf{Classification Metrics:} Macro-averaged F1-Score, Precision, and Recall, providing a holistic view of model performance on imbalanced datasets.
+        \item \textbf{Hardware Diagnostics:} CPU temperature, memory usage, and utilization are logged alongside training progress, aiding in the detection of thermal throttling or memory leaks during long training runs.
+ \end{itemize} + + \item \textbf{Efficient Data Sampling:} To manage large datasets efficiently, the loader utilizes a \texttt{SamplerDataset}. This limits the effective epoch size to 50,000 training samples and 5,000 validation samples, allowing for rapid iteration and feedback loops without needing to process the entire corpus in every epoch. +\end{itemize} + +\subsubsection{Conditional Compilation} +I think we should document a bit about this. + +\subsection{Rust Docker image} + +\subsection{Rust Inference Code} + +\subsection{Rust: Transformer-Based Text Classification Model Performance} + +The text classification model used in this experiment was based on a Transformer encoder architecture. The model consisted of token embeddings, positional embeddings, a multi-layer Transformer encoder, and a final linear classification layer. + +\subsubsection{Model Architecture} + +The architecture of the model is shown below: + +\begin{lstlisting} +TextClassificationModel { + transformer: TransformerEncoder { + d_model: 256, + d_ff: 1024, + n_heads: 8, + n_layers: 4, + dropout: 0.1, + norm_first: true, + quiet_softmax: true, + params: 3159040 + } + embedding_token: Embedding { + n_embedding: 28996, + d_model: 256, + params: 7422976 + } + embedding_pos: Embedding { + n_embedding: 256, + d_model: 256, + params: 65536 + } + output: Linear { + d_input: 256, + d_output: 4, + bias: true, + params: 1028 + } + n_classes: 4 + params: 10648580 +} +\end{lstlisting} + +The Transformer encoder used four encoder layers with eight attention heads per layer. Each layer had a model dimension of 256 and a feed-forward dimension of 1024. A dropout rate of 0.1 was used to reduce overfitting. + +The token embedding layer mapped a vocabulary of 28,996 tokens into 256-dimensional vectors. Positional embeddings of length 256 were also used so that the Transformer could capture token order information. + +The final output layer mapped the Transformer representation into four output classes. 
+ +The total number of trainable parameters in the model was 10,648,580. + +\subsubsection{Training Configuration} + +The model was trained for a total of 5 epochs. During training, the learning rate decayed from $1.107 \times 10^{-5}$ in the first epoch to $3.733 \times 10^{-6}$ in the final epoch. + +\subsubsection{Training Performance} + +Training accuracy improved steadily from 57.968\% in the first epoch to 81.474\% in the fifth epoch. Similarly, the training loss decreased from 0.981 to 0.507. + +The macro precision, recall, and F1-score also improved significantly during training: + +\begin{itemize} + \item Precision increased from 58.893\% to 81.606\% + \item Recall increased from 57.741\% to 81.334\% + \item F1-score increased from 50.201\% to 76.001\% +\end{itemize} + +These results indicate that the model learned meaningful semantic patterns in the text data over successive epochs. + +\subsubsection{Validation Performance} + +Validation performance also improved consistently. Validation accuracy increased from 72.280\% in the first epoch to a maximum of 81.640\% in the fourth epoch. + +The validation loss decreased from 0.731 to 0.507, showing that the model generalized reasonably well to unseen data. + +The validation precision, recall, and F1-score also showed strong improvement: + +\begin{itemize} + \item Precision increased from 72.230\% to 81.739\% + \item Recall increased from 72.258\% to 82.039\% + \item F1-score increased from 65.796\% to 76.509\% +\end{itemize} + +The relatively close values between training and validation accuracy suggest that the model did not suffer from severe overfitting. + +\begin{table*}[t!] +\centering +\caption{Training and Validation Metrics Summary for the Transformer-Based Text Classification Model} +% @kumar please layout sahi kar sakta hai? 
% [stray text removed]
The code mimics the architecture and logic of the Rust version to ensure comparable performance and behavior.
Note the inversion logic: PyTorch's \texttt{TransformerEncoder} expects \texttt{True} for padded positions (unlike some other implementations where 1 implies validity), requiring careful mask generation: + \begin{lstlisting} + mask_pad = (encoding['attention_mask'] == 0) + \end{lstlisting} + \item \textbf{Training Loop:} + The training loop is a standard PyTorch implementation using \texttt{tqdm} for progress tracking. It uses \texttt{CrossEntropyLoss} as the criterion and the \texttt{Adam} optimizer. Crucially, the scheduler step is called after every batch (not every epoch), consistent with the Noam schedule requirements. +\end{itemize} + + +\subsection{Python: Text Classification (News) Transformer Model} + +The model is a Transformer-based architecture designed for multi-class news classification. It consists of a multi-layer encoder with multi-head self-attention and feedforward networks. + +\textbf{Model Architecture:} + +\begin{lstlisting} +Model { + transformer_encoder: { + d_model: 256, + nhead: 8, + num_layers: 4, + dim_feedforward: 1024 + } + max_seq_len: 256 + num_classes: 4 + total params: 10649092 +} +\end{lstlisting} + +The model was trained for 5 epochs and showed steady convergence across all evaluation metrics. + +Training accuracy improved from 56.19\% to 79.70\%, while validation accuracy increased from 68.14\% to 79.00\%. +Training loss decreased from 1.0145 to 0.5483, and validation loss reduced from 0.8137 to 0.5628. + +The model achieved a macro F1-score of 0.7903 on the validation/test set, indicating reasonably strong classification performance for a Transformer trained over a small number of epochs. + +\begin{table*}[t!] 
+\centering +\begin{tabular}{|l|l|c|c|c|c|} +\hline +\textbf{Split} & \textbf{Metric} & \textbf{Min} & \textbf{Epoch} & \textbf{Max} & \textbf{Epoch} \\ +\hline + +Train & Accuracy & 56.19 & 1 & 79.70 & 5 \\ +Train & Loss & 0.5483 & 5 & 1.0145 & 1 \\ +Train & Grad Norm (Total) & 10.39 & 4 & 22.95 & 3 \\ +Train & Iteration Speed (it/s) & 44.33 & 2 & 45.05 & 1 \\ +Train & CPU Memory (GB) & 1.12 & -- & 1.12 & -- \\ +Train & CPU Usage (\%) & 22.2 & -- & 23.5 & -- \\ +\hline + +Valid & Accuracy & 68.14 & 1 & 79.00 & 5 \\ +Valid & Loss & 0.5628 & 5 & 0.8137 & 1 \\ +Valid & Precision@Top1 [Macro] & 0.7187 & 1 & 0.7942 & 5 \\ +Valid & Recall@Top1 [Macro] & 0.6815 & 1 & 0.7899 & 5 \\ +Valid & F1-Score@Top1 [Macro] & 0.6773 & 1 & 0.7903 & 5 \\ +Valid & Iteration Speed (it/s) & 44.33 & 2 & 45.05 & 1 \\ +Valid & CPU Memory (GB) & 1.12 & -- & 1.12 & -- \\ +Valid & CPU Usage (\%) & 22.2 & -- & 23.5 & -- \\ +\hline + +\end{tabular} +\caption{Transformer Text Classification Training and Validation Metrics} +\end{table*} + +\textbf{Training Efficiency and Stability:} +\begin{itemize} + \item Total Training Time: 706.99 seconds + \item Average Epoch Time: 139.83 seconds + \item Iteration Speed (Mean): 44.70 it/s + \item Gradient Norm (Mean): 15.75 + \item GPU Memory Usage: 178.79 MB + \item NaN Events: 0 + \item Convergence: Monotonic loss decrease + \item Overfitting Detected: No +\end{itemize} + +The results indicate stable training and consistent improvement across epochs. While performance is lower than simpler CNN-based tasks, this is expected due to the increased complexity of natural language understanding tasks. + +\subsection{PyTorch Inference Pipeline Docker Image: Hybrid NFS and Docker Inference Architecture} + +This section details the hybrid deployment strategy designed to optimize Docker image size and leverage a centralized machine learning environment. 
The architecture splits the responsibilities between a \textbf{Library VM} (storage-heavy) and a \textbf{Docker VM} (compute-centric). + +\subsubsection{Architecture Overview} + +The system comprises two primary components: +\begin{enumerate} + \item \textbf{Library VM (NFS Server)}: Hosts the heavy Python environment, including PyTorch, Transformers, and CUDA libraries. This environment is exported via NFS. + \item \textbf{Docker VM (Inference Client)}: Runs a lightweight Docker container that mounts the external libraries at runtime. +\end{enumerate} + +\subsubsection{Implementation Details} + +\paragraph{1. Library Sharing via NFS} +The Library VM exports the directory containing the Python site-packages. On the Docker VM, this directory is mounted using the \texttt{mount\_libs.sh} script. + +\vspace{0.5em} +\noindent\textbf{Listing 3: Mounting the NFS Library Volume} +\label{lst:nfs_mount} +\vspace{0.3em} + +\begin{lstlisting}[language=bash] +# Configuration from mount_libs.sh +NFS_SERVER_IP="172.16.203.14" +NFS_EXPORT_PATH="/home/iiitb/Documents/textClassificationVolume" +LOCAL_MOUNT_POINT="/mnt/text-libs" + +# Mounting the remote volume +sudo mount -t nfs "$NFS_SERVER_IP:$NFS_EXPORT_PATH" "$LOCAL_MOUNT_POINT" +\end{lstlisting} + +\paragraph{2. Lightweight Docker Image} +The Docker image is built using \texttt{Dockerfile.cpu} and excludes heavy ML libraries. It only contains the application code, the model weights, and minimal system dependencies. + +\vspace{0.5em} +\noindent\textbf{Listing 4: Dockerfile.cpu Configuration} +\label{lst:dockerfile_cpu} +\vspace{0.3em} + +\begin{lstlisting}[language=Dockerfile] +FROM python:3.12-slim + +# Point Python to the external NFS mount +ENV PYTHONPATH=/external-libs/text_env/lib/python3.12/site-packages + +# Copy only the app and model +COPY app.py ./ +COPY model_pytorch_text_classification/ag_news_model.pth ./model/ + +# No 'pip install torch' is performed here! +\end{lstlisting} + +\paragraph{3. 
Runtime Execution} +The container is launched via \texttt{run\_inference.sh}, which mounts the NFS volume into the container at \texttt{/external-libs}. + +\vspace{0.5em} +\noindent\textbf{Listing 5: GPU-Based Container Execution Command} +\label{lst:docker_gpu_run} +\vspace{0.3em} + +\begin{lstlisting}[language=bash] +docker run --gpus all \ + -v /mnt/text-libs:/external-libs \ + -v text_model_vol:/models \ + -e PYTHONPATH=/external-libs/text_env/lib/python3.12/site-packages \ + -p 8000:8000 \ + text_classification_image +\end{lstlisting} + +\subsubsection{Impact on Image Size} + +This architecture drastically reduces the storage footprint of the inference artifact. By decoupling the static libraries from the application logic, we achieve the following reduction: + +% \begin{table*}[t!] +% \centering +% \begin{tabular}{|l|c|c|} +% \hline +% \textbf{Component} & \textbf{Traditional Approach} & \textbf{Hybrid NFS Approach} \\ \hline +% Base Image (Python Slim) & $\sim$150 MB & $\sim$150 MB \\ \hline +% PyTorch & $\sim$3.5 GB & \textbf{0 MB (Mounted)} \\ \hline +% Transformers & $\sim$500 MB & \textbf{0 MB (Mounted)} \\ \hline +% Application Code & $<1$ MB & $<1$ MB \\ \hline +% Model Weights & $\sim$100 MB & $\sim$100 MB \\ \hline +% \textbf{Total Image Size} & \textbf{8.93 GB} & \textbf{$~250$ MB} \\ \hline +% \end{tabular} +% \caption{Comparison of Docker Image Sizes} +% \end{table*} + +% This \textbf{99.03\% reduction} in image size results in: +% \begin{itemize} +% \item Faster deployment and rollback times. +% \item Significantly lower network bandwidth usage. +% \item Efficient storage utilization on the Docker VM. +% \end{itemize} + +\subsection{Hybrid Inference Architecture with NFS and Docker} + +This section outlines the architectural design of our hybrid machine learning deployment strategy, detailing the distinct roles of the Library VM and the Docker VM, and how they interact to optimize resource usage. 
+ +\subsubsection{Library Virtual Machine (NFS Server)} + +The \textbf{Library VM} serves as the centralized repository for the heavy components of the machine learning environment. Its primary function is to host large, static dependencies such as the Python runtime environment, deep learning frameworks (e.g., PyTorch, TensorFlow), and specialized libraries (e.g., Transformers, CUDA routines). + +By consolidating these resource-intensive libraries on a single machine, we avoid the redundancy of installing them on every inference node. This machine acts as a Network File System (NFS) server, exporting its directory structure to be accessed by other machines in the network. + +\paragraph{What is an NFS Server?} + +A \textbf{Network File System (NFS)} server is a computer that allows other machines (clients) to access its files over a network as if they were stored locally. In our architecture, the NFS server "shares" the directory containing the Python libraries. The client machines can then read these files directly, eliminating the need to physically copy the heavy libraries to each client. + +\subsubsection{Docker Virtual Machine (Inference Node)} + +The \textbf{Docker VM} is the compute-centric node responsible for executing the inference workload. It hosts the Docker engine and runs the lightweight containerized application. + +This machine does not permanently store the heavy ML libraries. instead, it mounts the shared directory from the Library VM at runtime. reliable network connectivity to the Library VM ensures that the Docker container has immediate access to the necessary software dependencies. + +\subsubsection{Hybrid Deployment Strategy} + +The hybrid strategy combines the isolation and portability of Docker with the efficiency of centralized storage. 
+ +\begin{enumerate} + \item \textbf{Decoupling Environment and Application}: We separate the rapidly changing application code (API logic, business rules) from the slowly changing environment (Python packages). The application code resides inside the Docker image, while the environment resides on the NFS share. + \item \textbf{Runtime Linking}: When the Docker container starts, it mounts the NFS share. The container's environment variables are configured to add this mounted path to its Python path. This allows the Python interpreter inside the container to import modules (like \texttt{torch} or \texttt{transformers}) from the network share as if they were installed locally. + \item \textbf{Drastic Image Reduction}: Since the Docker image only contains the application code and minimal system dependencies, its size is reduced from several gigabytes to a few hundred megabytes. This facilitates rapid deployments, faster scaling, and reduced storage costs. +\end{enumerate} + +This architecture essentially transforms the Docker container into a lightweight "shell" that borrows its heavy "engine" from the Library VM only when needed. + +\subsubsection{Identifying the Virtual Machine Roles} + +The architecture explicitly designates two separate machines for distinct purposes. Based on the configuration scripts, their roles are defined as follows: + +\paragraph{1. The Library VM (Environment Host)} +This machine acts as the \textbf{storage backend} for the machine learning environment. +\begin{itemize} + \item \textbf{Role}: It hosts the actual Python environment (Torch, Transformers, etc.) on its local filesystem and exports it via NFS. + \item \textbf{Identifier}: In our configuration (see \texttt{mount\_libs.sh}), this machine is identified by the IP address \texttt{172.16.203.14}. + \item \textbf{Key Path}: The environment resides at \texttt{/home/iiitb/Documents/textClassificationVolume}. + \item \textbf{Action}: It does \textit{not} run the Docker container. 
Ideally, it simply stays online to serve files to other machines. +\end{itemize} + +\paragraph{2. The Docker VM (Inference Runner)} +This machine acts as the \textbf{compute frontend} that serves the API. +\begin{itemize} + \item \textbf{Role}: It builds and runs the lightweight Docker container. It does not have the deep learning libraries installed on its own disk; it borrows them from the Library VM. + \item \textbf{Identifier}: This is the machine where you execute the \texttt{mount\_libs.sh} and \texttt{run\_inference.sh} scripts. + \item \textbf{Key Path}: It mounts the remote library to the local path \texttt{/mnt/text-libs}. + \item \textbf{Action}: It executes the \texttt{docker run} command, effectively "bringing the code to the data" (or in this case, bringing the library data to the code container). +\end{itemize} + +\begin{table*}[t!] +\centering +\begin{tabular}{|l|l|l|} +\hline +\textbf{Feature} & \textbf{Library VM} & \textbf{Docker VM} \\ \hline +\textbf{Primary Function} & Storage \& NFS Server & Model Inference \& API Hosting \\ \hline +\textbf{IP Address} & \texttt{172.16.203.14} & (Assigned by Network) \\ \hline +\textbf{Python Libs} & Stored Physically on Disk & Mounted via Network (NFS) \\ \hline +\textbf{Docker Image} & Not required & Builds \& Runs Lightweight Image \\ \hline +\end{tabular} +\caption{Distinction between Library VM and Docker VM} +\end{table*} + + + + +\newpage + +\subsection{Container Comparison: Python vs Rust} + +\begin{table*}[t!] 
+\centering +\begin{tabular}{|l|c|c|} +\hline +\textbf{Feature} & \textbf{Python (GPU)} & \textbf{Rust (WGPU)} \\ +\hline +Image Name & text\_classification\_image & text\_class\_rs \\ +\hline +Size & 4.02 GB & $\sim$1 GB \\ +\hline +Backend & PyTorch + CUDA & Native Rust (WGPU) \\ +\hline +Startup Time & Slower & Faster \\ +\hline +Dependencies & Heavy (Torch, CUDA, Python) & Minimal (compiled binary) \\ +\hline +Deployment Complexity & Higher & Lower \\ +\hline +Flexibility & High (research-friendly) & Moderate \\ +\hline +Runtime Stability & Medium & High \\ +\hline +\end{tabular} +\caption{Comparison of Python GPU-based and Rust-based inference containers} +\end{table*} + +\noindent +The Python-based container provides flexibility and rapid experimentation using the PyTorch ecosystem, but at the cost of larger image size and dependency complexity. In contrast, the Rust-based container offers a lightweight, production-ready solution with faster startup time and minimal runtime dependencies, making it more suitable for deployment scenarios. + + +\subsection{Model Size Comparison} + +\begin{table*}[t!] +\centering +\begin{tabular}{|l|c|c|} +\hline +\textbf{Model} & \textbf{Rust (.mpk/.bin)} & \textbf{Python (.pt/.pth)} \\ +\hline +MNIST & 1.1 MB & 2 MB \\ +\hline +Text Classification (AG News) & 21 MB & 40.6 MB \\ +\hline +Regression & 4 KB & 6 MB \\ +\hline +LSTM & 60 KB & 120 KB \\ +\hline +\end{tabular} +\caption{Comparison of model sizes between Rust and Python implementations} +\end{table*} + +\noindent +Rust-based serialized models are consistently smaller than their Python counterparts. This reduction is most significant in simpler models such as regression, and remains substantial for larger models like text classification. The smaller footprint of Rust models makes them more suitable for lightweight deployment and resource-constrained environments. 
+ +\clearpage + +\section{Task: LSTM implementation} +\newpage + +\subsection{Introduction} +This document outlines the detailed architectural, mathematical, and translational specifics of implementing a Long Short-Term Memory (LSTM) model across two prominent machine learning environments: Rust (using the Burn framework) and Python (using PyTorch). It covers the model architecture, training pipelines, specialized deployment techniques using network filesystems (NFS) with Docker, and language-specific design implications. + +\subsection{Model Architecture and Mathematical Formulation} + +\subsubsection{Mathematical Foundation of the LSTM Cell} +The core of the model revolves around a custom, manually-implemented LSTM cell. Instead of relying on the standard un-inspectable black-box LSTM implementations provided by typical ML libraries, both codebases explicitly define the cell-level math. + +For a given timestep $t$, the input tensor $x_t$ and the previous hidden state $h_{t-1}$ are used to compute the various gates. The mathematical formulation utilized is: +\begin{align} + f_t &= \sigma(W_f \cdot [h_{t-1}, x_t] + b_f) \quad &\text{(Forget Gate)} \\ + i_t &= \sigma(W_i \cdot [h_{t-1}, x_t] + b_i) \quad &\text{(Input Gate)} \\ + g_t &= \tanh(W_g \cdot [h_{t-1}, x_t] + b_g) \quad &\text{(Candidate State)} \\ + o_t &= \sigma(W_o \cdot [h_{t-1}, x_t] + b_o) \quad &\text{(Output Gate)} +\end{align} +\begin{align} + c_t &= f_t \odot c_{t-1} + i_t \odot g_t \quad &\text{(New Cell State)} \\ + h_t &= o_t \odot \tanh(c_t) \quad &\text{(New Hidden State)} +\end{align} + +Where: +\begin{itemize} + \item $\sigma$ represents the Sigmoid activation function. + \item $\tanh$ represents the Hyperbolic Tangent activation function. + \item $\odot$ denotes element-wise multiplication (Hadamard product). + \item $[h_{t-1}, x_t]$ symbolizes the concatenation of the previous hidden state and the current input. 
+\end{itemize} + +\subsubsection{Architectural Details} +Both implementations adhere strictly to the following architectural design: +\begin{enumerate} + \item \textbf{Layer Normalization:} Pre-activation gates, the cell state ($c_t$), and the hidden state ($h_t$) pass through separate \texttt{LayerNorm} layers. This design choice stabilizes training dynamics since the feature distributions inside the LSTM evolve at every sequence step (making standard Batch Normalization ineffective). + \item \textbf{Optimized Gate Compute:} Instead of computing 4 separate linear transformations per timestep for the features, the model employs a single combined projection that outputs a $4 \times \text{hidden\_size}$ tensor. This tensor is subsequently split into four chunks corresponding to the $i, f, g$, and $o$ gates. + \item \textbf{Bidirectional Support:} An encapsulated \texttt{StackedLstm} module stacks multiple manual LSTM layers (applying dropout between layers except on the final one). The main \texttt{LstmNetwork} integrates a forward processing stack and an optional backward processing stack (which flips the temporal dimension of the input sequence). Their respective output hidden states are concatenated along the feature dimension before passing through a fully-connected projection head. + \item \textbf{Initialization bias:} The forget-gate bias parameters are explicitly initialized to $1.0$ (via Xavier Normal parameter slicing) to prevent fatal early-training gradient decay. +\end{enumerate} + + +\subsection{Rust Backend: Load Testing with Locust} + +The performance of the Rust-based backend was evaluated using the Locust load testing framework. The objective was to analyze system behavior under concurrent user load and measure key performance characteristics such as throughput and latency. 
+ +\textbf{Testing Setup:} +\begin{itemize} + \item Tool: Locust + \item Backend: Rust (HTTP service) + \item Test Type: Concurrent user load simulation + \item Environment: Linux system +\end{itemize} + +\textbf{Dashboard Visualization:} +\textbf{Full Report:} + +The complete load testing dashboard has been exported as a PDF and is included below for detailed inspection. + +\includepdf[pages=-]{RustLocust/LSTM.pdf} + + + +\subsection{Python Backend: Load Testing with Locust} + +The performance of the Python-based backend was evaluated using the Locust load testing framework. The goal was to assess system behavior under concurrent user load and analyze key performance characteristics such as throughput and response latency. + +\textbf{Testing Setup:} +\begin{itemize} + \item Tool: Locust + \item Backend: Python (HTTP service) + \item Test Type: Concurrent user load simulation + \item Environment: Linux system +\end{itemize} + +\textbf{Dashboard Visualization:} + +\textbf{Full Report:} + +The complete load testing dashboard has been exported as a PDF and is included below for detailed inspection. + +\includepdf[pages=-]{PythonLocust/LSTM.pdf} + +\subsection{Training Pipeline} +The training behavior is intentionally synchronized to ensure parity between the languages: +\begin{itemize} + \item \textbf{Data Loading:} Operates synchronously on synthetically generated noisy sequential datasets. The validation set is scaled symmetrically relative to the training set ($20\%$ of training size). + \item \textbf{Optimization Algorithm:} Utilizes the Adam Optimizer. + \item \textbf{Loss Function:} Mean Squared Error (MSE), with reduction set to \textit{mean}. Both explicitly weigh loss accumulation during epoch passes by scaling local batch losses by the discrete batch size, averaging properly at the conclusion of the epoch. + \item \textbf{Gradient Clipping:} Ensures numerical stability on longer sequence inputs. 
The gradient norm is strictly clipped to $\max = 1.0$ right before the optimizer steps. + \item \textbf{Artifacts Output:} Training scripts generate an \texttt{artifact\_dir} where they store a \texttt{config.json} representation of hyperparameters, and the full state dictionary (\texttt{model.pt} in PyTorch; CompactRecorder files in Rust Burn). +\end{itemize} + +\subsection{Inference Pipeline and Docker NFS Integration} +\subsubsection{PyTorch Inference Architecture} +A critical requirement for modern PyTorch inference deployments is resolving the massive disk footprint of CUDA-enabled PyTorch backend libraries. The PyTorch pipeline employs a sophisticated Network File System (NFS) logic to achieve a highly optimized, lightweight Dockerized inference deployment: +\begin{enumerate} + \item \textbf{External Library Mounting:} A host-level script (\texttt{mount\_libs.sh}) maps an external NAS/NFS storage partition (from \texttt{172.16.203.14}) loaded with Python environments targeting \texttt{/mnt/LSTM-libs}. + \item \textbf{Optimized Dockerfile:} The image leverages the \texttt{nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04} base image and installs basic \texttt{python3.11} runtime headers without calling \texttt{pip install torch}. Thus, the final image size is structurally negligible compared to standard ml-images. + \item \textbf{Runtime Binding:} The inference container bootloader scripts (\texttt{run\_container.sh}) bind these volume mounts (\texttt{-v \$NFS\_MOUNT\_POINT:/external-libs}) and crucially overrides the \texttt{PYTHONPATH} env-variable: + \begin{lstlisting} + -e PYTHONPATH="$CONTAINER_LIB_MOUNT/LSTM_env/lib/.../site-packages" + \end{lstlisting} + \item \textbf{Inference Execution:} \texttt{app.py} loads the model weights off an abstracted configuration path, builds a zero-gradient loader, runs inference iteratively over a single collapsed batch, and yields predictions natively. 
+\end{enumerate} + +\subsubsection{Rust Inference Architecture} +Rust's inference pipeline diverges significantly regarding deployment complexity due to compilation structures: +\begin{itemize} + \item \textbf{Stateless Binaries:} No containerized runtime libraries are mandated because Burn compiles statically down to heavily optimized binaries, pulling model states directly via the \texttt{CompactRecorder}. + \item \textbf{Visualization:} Results are mapped into native Polars \texttt{DataFrame} objects (\texttt{df![]}) rendering lightweight native tables detailing \textit{expected targets} versus \textit{computed predictions}. +\end{itemize} + +\subsection{Implementation Specifics} +\subsubsection{PyTorch Specific Constraints} +\begin{itemize} + \item \textbf{Dynamic computation graphing:} The \texttt{model.py} cleanly slices and chunks gates natively on tensors (e.g., \texttt{gates.chunk(4, dim=1)}). + \item \textbf{Sequence Reversals:} Done programmatically via continuous \texttt{Tensor.flip(dims=[1])} which mandates that tensors must remain contiguously stored within PyTorch internals to avoid memory reallocation overhead. + \item \textbf{Seed Setting API:} Requires deterministic locking across four sub-systems (\texttt{random, numpy, torch, torch.cuda}) to match Rust's reproducibility parameters. +\end{itemize} + +\subsubsection{Rust (Burn) Specific Constraints} +\begin{itemize} + \item \textbf{Compile-Time Dimension Types:} Rust explicitly binds Tensor dimensionality at compile time (\texttt{Tensor} vs \texttt{Tensor}). This offers un-matched safety by forbidding invalid dimension injections that PyTorch would crash on dynamically. + \item \textbf{Trait Encapsulation:} Leverages explicit trait architectures (\texttt{\#[derive(Module, Config)]}) that automate saving hyperparameters and generating gradient backends. Burn models must be mapped cleanly from standard states to \texttt{autodiff} states. 
+ \item \textbf{No-Mutation Logic:} State mutations generated sequentially in LSTMs are represented safely utilizing explicit tuple destructuring via \texttt{LstmState\{hidden, cell\}}, bypassing complex internal pointer tracking. + \item \textbf{Explicit Initialization Handling:} Since Burn limits orthogonal initializes out-of-the-box, Xavier Normalization was invoked explicitly, paired with \texttt{slice\_assign} tensor mappings to safely load the 1.0 uniform fill into the forget-gate components. +\end{itemize} + +\subsection{Rust: Training Loss Progression and Model Convergence} + +The model was trained for a total of 30 epochs. During training, both the training loss and validation loss decreased substantially, indicating that the model was able to learn meaningful patterns from the data. + +\subsubsection{Training Progress} + +The training process began with relatively high loss values. However, as training progressed, both the average training loss and average validation loss consistently decreased. + +The recorded loss values at different stages of training are shown below: + +\begin{table*}[t!] +\centering +\caption{Training and Validation Loss Progression} +\begin{tabular}{|c|c|c|} +\hline +\textbf{Epoch} & \textbf{Average Training Loss} & \textbf{Average Validation Loss} \\ +\hline +5 & 4456.9658 & 4473.4448 \\ +10 & 2510.1016 & 2438.3970 \\ +15 & 900.7457 & 801.6573 \\ +20 & 154.4127 & 164.8311 \\ +25 & 48.2149 & 20.1441 \\ +30 & 52.1122 & 17.5850 \\ +\hline +\end{tabular} +\label{tab:loss_progression} +\end{table*} + +\subsubsection{Loss Trend Analysis} + +The training loss decreased from 4456.9658 at epoch 5 to 52.1122 at epoch 30. Similarly, the validation loss decreased from 4473.4448 to 17.5850 over the same period. + +This large reduction in both training and validation loss suggests that the model successfully converged during training. 
+ +Although the training loss slightly increased between epoch 25 and epoch 30, the validation loss continued to decrease. This indicates that the model continued to improve its ability to generalize to unseen data. + +The lowest validation loss achieved during the experiment was: + +\[ +17.5850 +\] + +at epoch 30. + +\subsubsection{Generalization Performance} + +The close alignment between the training loss and validation loss throughout training suggests that the model did not suffer from severe overfitting. + +In the earlier epochs, both losses were very high, which is expected because the model parameters were still being optimized. As training continued, the losses dropped rapidly, especially between epochs 10 and 25. + +This behavior indicates that the model learned most of its predictive capability during the middle phase of training. + +\subsubsection{Execution Time} + +The complete training process required: + +\begin{itemize} + \item Real time: 6 minutes and 42.185 seconds + \item User CPU time: 6 minutes and 42.414 seconds + \item System CPU time: 3.49 seconds +\end{itemize} + +The relatively low system CPU time compared to user CPU time suggests that most of the runtime was spent performing model computation rather than operating system overhead. +\subsection{Python: LSTM Model Architecture and Training Performance} + +The Long Short-Term Memory (LSTM) model consists of a 2-layer bidirectional LSTM followed by a fully connected output layer. Dropout is applied between LSTM layers to improve generalization. + +\textbf{Model Architecture:} + + +The model was trained for 30 epochs. Training and validation performance improved significantly and consistently. + +Training loss decreased from 5543.18 to 56.11 (98.99\% reduction), while validation loss decreased from 5699.26 to 48.92. +The model achieved strong regression performance, with validation RMSE reaching 6.99 and MAE reaching 4.33. 
+ +The $R^2$ score improved from negative values to 0.9517, indicating strong predictive capability. + +\begin{table*}[t!] +\centering +\begin{tabular}{|l|l|c|c|c|c|} +\hline +\textbf{Split} & \textbf{Metric} & \textbf{Min} & \textbf{Epoch} & \textbf{Max} & \textbf{Epoch} \\ +\hline + +Train & Loss & 56.11 & 30 & 5543.18 & 1 \\ +Train & RMSE & 7.49 & 30 & 74.45 & 1 \\ +Train & MAE & 5.03 & 30 & 67.25 & 1 \\ +Train & R$^2$ & -4.41 & 1 & 0.9452 & 30 \\ +Train & Grad Norm (Total) & 524.61 & 1 & 4972.41 & 24 \\ +Train & Iteration Speed (it/s) & 5.02 & 29 & 6.94 & 4 \\ +Train & CPU Memory (GB) & 0.87 & -- & 1.13 & -- \\ +Train & CPU Usage (\%) & 72.4 & -- & 88.8 & -- \\ +\hline + +Valid & Loss & 47.96 & 29 & 5699.26 & 1 \\ +Valid & RMSE & 6.93 & 29 & 75.49 & 1 \\ +Valid & MAE & 3.54 & 28 & 68.51 & 1 \\ +Valid & R$^2$ & -4.63 & 1 & 0.9526 & 29 \\ +Valid & Iteration Speed (it/s) & 5.02 & 29 & 6.94 & 4 \\ +Valid & CPU Memory (GB) & 0.87 & -- & 1.13 & -- \\ +Valid & CPU Usage (\%) & 72.4 & -- & 88.8 & -- \\ +\hline + +\end{tabular} +\caption{Extended LSTM Training and Validation Metrics Summary} +\end{table*} + +\textbf{Training Efficiency and Stability:} +\begin{itemize} + \item Total Training Time: 158.63 seconds + \item Average Epoch Time: 5.18 seconds + \item Iteration Speed (Mean): 6.22 it/s + \item Gradient Norm (Mean): 1394.59 + \item NaN Events: 0 + \item Convergence: Monotonic loss decrease + \item Overfitting Detected: No +\end{itemize} + + + + +\newpage + +\subsection{Container Comparison: Python vs Rust} + +\begin{table*}[t!] 
+\centering +\begin{tabular}{|l|c|c|} +\hline +\textbf{Feature} & \textbf{Python (GPU)} & \textbf{Rust (WGPU)} \\ +\hline +Image Name & text\_classification\_image & text\_class\_rs \\ +\hline +Size & 3.98GB & 974MB \\ +\hline +Backend & PyTorch + CUDA & Native Rust (WGPU) \\ +\hline +Startup Time & Slower & Faster \\ +\hline +Dependencies & Heavy (Torch, CUDA, Python) & Minimal (compiled binary) \\ +\hline +Deployment Complexity & Higher & Lower \\ +\hline +Flexibility & High (research-friendly) & Moderate \\ +\hline +Runtime Stability & Medium & High \\ +\hline +\end{tabular} +\caption{Comparison of Python GPU-based and Rust-based inference containers} +\end{table*} + +\noindent +The Python-based container provides flexibility and rapid experimentation using the PyTorch ecosystem, but at the cost of larger image size and dependency complexity. In contrast, the Rust-based container offers a lightweight, production-ready solution with faster startup time and minimal runtime dependencies, making it more suitable for deployment scenarios. + + +\subsection{Model Size Comparison} + +\begin{table*}[t!] +\centering +\begin{tabular}{|l|c|c|} +\hline +\textbf{Model} & \textbf{Rust (.mpk/.bin)} & \textbf{Python (.pt/.pth)} \\ +\hline +MNIST & 1.1 MB & 2 MB \\ +\hline +Text Classification (AG News) & 21 MB & 40.6 MB \\ +\hline +Regression & 4 KB & 6 MB \\ +\hline +LSTM & 60 KB & 120 KB \\ +\hline +\end{tabular} +\caption{Comparison of model sizes between Rust and Python implementations} +\end{table*} + +\noindent +Rust-based serialized models are consistently smaller than their Python counterparts. This reduction is most significant in simpler models such as regression, and remains substantial for larger models like text classification. The smaller footprint of Rust models makes them more suitable for lightweight deployment and resource-constrained environments. 
+ +\clearpage + +\EOD +\end{document} \ No newline at end of file diff --git a/latex_reports/draft_2.tex b/latex_reports/draft_2.tex new file mode 100644 index 0000000..b92a0d1 --- /dev/null +++ b/latex_reports/draft_2.tex @@ -0,0 +1,1297 @@ +\documentclass{ieeeaccess} +\usepackage{cite} +\usepackage{amsmath,amssymb,amsfonts} +\usepackage{algorithmic} +\usepackage{textcomp} + +\def\BibTeX{{\rm B\kern-.05em{\sc i\kern-.025em b}\kern-.08em + T\kern-.1667em\lower.7ex\hbox{E}\kern-.125emX}} + +% Encoding and fonts +\usepackage[T1]{fontenc} +\usepackage[utf8]{inputenc} +\usepackage{pdfpages} +\usepackage{enumitem} +\setlist{noitemsep,topsep=0pt,parsep=0pt,partopsep=0pt} + +% Math and graphics +\usepackage{graphicx} +\usepackage{booktabs} +\usepackage{array} +\usepackage{float} + +% URLs (robust line breaking) +\usepackage{url} +\usepackage[hidelinks]{hyperref} +\def\UrlBreaks{\do/\do-\do_} + +% Code listings (stable for IEEE) +\usepackage{listings} +\usepackage{xcolor} +\lstset{ + basicstyle=\ttfamily\footnotesize, + breaklines=true, + breakatwhitespace=false, % <-- IMPORTANT (change this) + columns=fullflexible, + keepspaces=true, + showstringspaces=false +} + +% Custom Dockerfile language +\lstdefinelanguage{Dockerfile}{ + keywords={FROM, RUN, CMD, LABEL, MAINTAINER, EXPOSE, ENV, ADD, COPY, ENTRYPOINT, VOLUME, USER, WORKDIR, ARG, ONBUILD, STOPSIGNAL, HEALTHCHECK, SHELL}, + sensitive=true, + comment=[l]{\#}, + morestring=[b]" +} + +\begin{document} +\history{Date of publication xxxx 00, 0000, date of current version xxxx 00, 0000.} +\doi{10.1109/ACCESS.2017.DOI} + +\title{System-Level Evaluation of Rust and Python for Machine Learning} +\author{\uppercase{Project Elective}\authorrefmark{1}, +\IEEEmembership{Member, IEEE}} +\address[1]{Project Elective (e-mail: project@elective.com)} +\tfootnote{This paragraph of the first footnote will contain support +information, including sponsor and financial support acknowledgment. 
For +example, ``This work was supported in part by the U.S. Department of +Commerce under Grant BS123456.''} + +\markboth +{Project Elective \headeretal: System-Level Evaluation of Rust and Python for Machine Learning} +{Project Elective \headeretal: System-Level Evaluation of Rust and Python for Machine Learning} + +\corresp{Corresponding author: Project Elective (e-mail: project@elective.com).} + +\begin{abstract} +These instructions give you guidelines for preparing papers for +IEEE Access. Use this document as a template if you are +using \LaTeX. Otherwise, use this document as an +instruction set. The electronic file of your paper will be formatted further +at IEEE. Paper titles should be written in uppercase and lowercase letters, +not all uppercase. Avoid writing long formulas with subscripts in the title; +short formulas that identify the elements are fine (e.g., "Nd--Fe--B"). Do +not write ``(Invited)'' in the title. Full names of authors are preferred in +the author field, but are not required. Put a space between authors' +initials. The abstract must be a concise yet comprehensive reflection of +what is in your article. In particular, the abstract must be self-contained, +without abbreviations, footnotes, or references. It should be a microcosm of +the full article. The abstract must be between 150--250 words. Be sure that +you adhere to these limits; otherwise, you will need to edit your abstract +accordingly. The abstract must be written as one paragraph, and should not +contain displayed mathematical equations or tabular material. The abstract +should include three or four different keywords or phrases, as this will +help readers to find it. It is important to avoid over-repetition of such +phrases as this can result in a page being rejected by search engines. +Ensure that your abstract reads well and is grammatically correct. +\end{abstract} + +\begin{keywords} +Enter key words or phrases in alphabetical +order, separated by commas. 
For a list of suggested keywords, send a blank +e-mail to keywords@ieee.org or visit \underline +{http://www.ieee.org/organizations/pubs/ani\_prod/keywrd98.txt} +\end{keywords} + +\titlepgskip=-15pt + +\maketitle +\section{Overview of the Project} + +This project studies the use of \textbf{Rust} as an alternative systems language for machine learning workflows traditionally implemented in \textbf{Python}. +Rather than focusing on state-of-the-art model performance, the emphasis is on: + +\begin{itemize} + \item feasibility of end-to-end ML workflows, + \item system stability and reproducibility, + \item developer experience and DevOps complexity, + \item deployment and operational characteristics. +\end{itemize} + +To ensure clarity and rigor, the work is organized into \textbf{two clearly separated experimental tracks}. + +\hrule + +\section{Project Structure: Two-Track Evaluation} + +The project consists of the following two tracks: + +\subsection*{Track 1: Training-Based Systems Evaluation} +This track compares \textbf{machine learning training pipelines} implemented in: +\begin{itemize} + \item PyTorch (Python), and + \item Burn (Rust). +\end{itemize} + +The goal is to evaluate training feasibility, stability, compile-time guarantees, and DevOps impact, rather than raw training speed. + +\subsection*{Track 2: Inference-Based DevOps Evaluation} +This track compares \textbf{production-style inference services} implemented in: +\begin{itemize} + \item Python-based ONNX inference, and + \item Rust-based ONNX inference. +\end{itemize} + +The focus is on deployment, security, containerization, CI/CD behavior, and runtime efficiency. + +Each track is designed to answer a distinct research question while remaining complementary. + +\section{Machine Learning Tasks Considered} + +To ensure coverage of diverse ML workloads, the following tasks are identified: + +\begin{itemize} + \item \textbf{Text Classification}: Dataset to be finalized. 
+ \item \textbf{Image Classification}: MNIST dataset. + \item \textbf{Credit Score Assignment}: Supervised classification task. + \item \textbf{Multi-Objective Machine Learning}: Brain Tumor dataset with a MOML formulation. + \item \textbf{Fine-Tuning Task}: BERT-based classification (ANLP Assignment 1), with optional LoRA / QLoRA. + \item \textbf{Autoregressive Decoding}: Experiments using the Burn framework. +\end{itemize} + +At the current stage, the \textbf{MNIST image classification task has been fully implemented}. +The corresponding training code is available in the project GitHub repository. + +\hrule + +\section{Related Work} + +The following research papers are being used to guide experimental design and evaluation: + +\begin{itemize} + \item \url{https://ieeexplore.ieee.org/document/11126113} + \item \url{https://ieeexplore.ieee.org/document/11261485} + \item \url{https://ieeexplore.ieee.org/document/11212348} + \item \url{https://www.ijsred.com/volume8/issue2/IJSRED-V8I2P143.pdf} +\end{itemize} + +\hrule + +\section{Code Repository and Current Status} + +Project repository: +\begin{center} +\url{https://github.com/Abhinav-Kumar012/Rust_Python_ML_PE.git} +\end{center} + +Current progress includes: +\begin{itemize} + \item MNIST training pipeline implemented + \item PyTorch baseline established + \item Initial Rust (Burn) training setup completed +\end{itemize} + +\hrule + +\section{Track 1: Training-Based Systems Evaluation} + +\subsection{Objective} + +The objective of this track is to answer the following research question: + +\begin{quote} +\textit{Can Rust realistically support end-to-end machine learning training pipelines, and what system-level trade-offs does this introduce compared to PyTorch?} +\end{quote} + +This track explicitly avoids speed-centric benchmarking and instead focuses on system behavior. 
+ +\hrule + +\subsection{Frameworks Compared} + +\subsubsection{PyTorch (Baseline)} +\begin{itemize} + \item Language: Python + \item Training maturity: Very high + \item Ecosystem: Extensive +\end{itemize} + +\subsubsection{Rust (Burn)} +\begin{itemize} + \item Language: Rust + \item Training maturity: Emerging + \item Design: Idiomatic Rust, native training support +\end{itemize} + +\hrule + +\subsection{Experimental Controls} + +\textbf{Fixed Across Both Implementations} +\begin{itemize} + \item Dataset splits + \item Number of epochs + \item Batch size + \item Optimizer type + \item Learning rate + \item Hardware +\end{itemize} + +\textbf{Allowed Differences} +\begin{itemize} + \item Internal kernel implementations + \item Graph execution model + \item Memory management +\end{itemize} + +\hrule + +\subsection{Metrics Collected} + +\begin{itemize} + \item Training time per epoch (reported cautiously) + \item Loss curves and convergence behavior + \item Runtime failures and numerical stability + \item Reproducibility across runs + \item Environment setup and build complexity + \item Dependency footprint and artifact size +\end{itemize} + +\hrule + +\section{Track 2: Inference-Based DevOps Evaluation} + +\subsection{Objective} + +The objective of this track is to compare \textbf{deployment, security, and operational characteristics} of Python-based and Rust-based ML inference services executing the same ONNX model. + +\hrule + +\subsection{Inference Services Compared} + +\textbf{Python Service} +\begin{itemize} + \item FastAPI + Uvicorn + \item ONNX Runtime (Python) +\end{itemize} + +\textbf{Rust Service} +\begin{itemize} + \item Axum / Actix + \item burn-rs +\end{itemize} + +Both services expose identical inference endpoints and return identical outputs. 
The following tasks are planned for the next phase of the project:

\begin{itemize}
    \item Develop production-style inference services for both Python and Rust.
    \item Write Dockerfiles for Python and Rust inference services.
    \item Set up Jenkins-based CI pipelines for inference, including build, test, containerization, and security scanning.
\end{itemize}

\section{Task: MNIST Image Classification}
\subsection{Architecture Details}




\subsection{Rust Backend: Load Testing with Locust}

The performance of the Rust-based backend was evaluated using the Locust load testing framework. The objective was to analyze system behavior under concurrent user load and measure key performance characteristics such as throughput and latency.

\textbf{Testing Setup:}
+ +\textbf{Testing Setup:} +\begin{itemize} + \item Tool: Locust + \item Backend: Python (HTTP service) + \item Test Type: Concurrent user load simulation + \item Environment: Linux system +\end{itemize} + +\textbf{Dashboard Visualization:} + +\textbf{Full Report:} + +The complete load testing dashboard has been exported as a PDF and is included below for detailed inspection. + +\begin{figure}[H] +\centering +\includegraphics[width=\linewidth]{PythonLocust/MNIST.pdf} +\caption{Python Backend Load Testing Dashboard for MNIST} +\end{figure} + +\subsection{Docker Containerization Strategy} +Both Rust and Python inference workflows leverage highly optimized container strategies. For Rust, a multi-stage Docker build compiles the application within an \texttt{ubuntu:16.04} builder and transfers the standalone binary to a minimal \texttt{nvidia/vulkan:1.3-470} runtime image. + + + + +The Python (PyTorch) container minimizes footprint by avoiding framework installation inside the image. Utilizing \texttt{python:3.12-slim}, it only installs \texttt{fastapi} and maps heavy ML dependencies at runtime via an external NFS volume (\texttt{PYTHONPATH} override). This reduces the image size from gigabytes to under 150MB, drastically accelerating deployments. + + +\subsection{Rust: CNN Model Architecture and Training Performance} + +The convolutional neural network (CNN) model used for the experiment consisted of two convolutional layers followed by adaptive average pooling, dropout, and two fully connected layers. 
The complete architecture is shown below: + +\begin{lstlisting} +Model { + conv1: Conv2d {ch_in: 1, ch_out: 8, stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Valid, params: 80} + conv2: Conv2d {ch_in: 8, ch_out: 16, stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Valid, params: 1168} + pool: AdaptiveAvgPool2d {output_size: [8, 8]} + dropout: Dropout {prob: 0.5} + linear1: Linear {d_input: 1024, d_output: 512, bias: true, params: 524800} + linear2: Linear {d_input: 512, d_output: 10, bias: true, params: 5130} + activation: Relu + params: 531178 +} +\end{lstlisting} + +The model was trained for 10 epochs. Over the course of training, both the training and validation performance improved consistently. Training accuracy increased from 81.575\% in the first epoch to 97.300\% in the final epoch, while validation accuracy improved from 92.133\% to 98.517\%. + +Similarly, the training loss decreased significantly from 0.656 to 0.087, and the validation loss reduced from 0.258 to 0.054 by the end of training. The macro F1-score also improved substantially, reaching 96.974\% for training and 98.321\% for validation. + + + +The results indicate that the model achieved strong generalization performance with minimal overfitting, as the validation accuracy remained slightly higher than the training accuracy throughout the experiment. The consistently high Top-5 accuracy values further demonstrate that the model was able to correctly identify the correct class within its top predictions. + +It should also be noted that the execution terminated with a segmentation fault after training completion. However, since the fault occurred after all epochs had been completed and metrics had already been recorded, it did not affect the validity of the training results. \\ + +The time taken to train the model is 229.916s (3min 49.916s). 
+ + + +\subsection{Python: CNN Model Architecture and Training Performance} + +The convolutional neural network (CNN) model implemented in Python mirrors the Rust architecture, consisting of two convolutional layers followed by adaptive average pooling, dropout regularization, and two fully connected layers. + +\textbf{Model Architecture:} + +\begin{lstlisting} +Model { + conv1: Conv2d {ch_in: 1, ch_out: 8, kernel_size: [3, 3], stride: [1, 1]} + conv2: Conv2d {ch_in: 8, ch_out: 16, kernel_size: [3, 3], stride: [1, 1]} + pool: AdaptiveAvgPool2d {output_size: [8, 8]} + dropout: Dropout {prob: 0.5} + linear1: Linear {d_input: 1024, d_output: 512, params: 524800} + linear2: Linear {d_input: 512, d_output: 10, params: 5130} + activation: ReLU + total params: 531178 +} +\end{lstlisting} + +The model was trained for 10 epochs. Training and validation performance improved consistently over time. + +Training accuracy increased from 82.01\% to 97.29\%, while validation accuracy improved from 92.87\% to 98.20\%. +Training loss decreased significantly from 0.5947 to 0.0867, and validation loss reduced from 0.2475 to 0.0579. + +The macro F1-score reached 0.9814, demonstrating strong classification performance. Additionally, the Top-5 accuracy achieved 99.98\%, indicating highly reliable predictions. + + + +The results indicate strong generalization performance with no signs of overfitting. Validation accuracy remained consistently high and closely followed training accuracy. + +Training was stable with zero NaN events observed. The total training time was 182.23 seconds, with an average epoch time of 15.05 seconds and an average iteration speed of 50.58 iterations per second. 
+ + + +\section{Task: Regression} + +\subsection{Introduction} +This document outlines the architectural, mathematical, and deployment specifics of implementing a Neural Network-based Regression model across two disparate machine learning environments: Rust (utilizing the Burn framework) and Python (utilizing PyTorch). It covers the distinct model architecture decisions, dataset handling strategies, and specialized pipeline deployment techniques leveraging Network File Systems (NFS) mapping via Docker bounds. + +\subsection{Model Architecture and Mathematical Formulation} + +\subsubsection{Mathematical Foundation} +The core mathematical foundation deployed across both frameworks is a classical Feed-Forward Neural Network consisting of a single hidden dimension mapping inputs directly onto a continuous single-variable regression output. + +For a given input feature vector $X \in \mathbb{R}^N$ (where $N$ dictates the feature size depending on the target dataset), the network's forward transformation can be represented sequentially as: +\begin{align} + Z_1 &= X \cdot W_1^T + b_1 \quad &\text{(Input Projection)} \\ + A_1 &= \max(0, Z_1) \quad &\text{(ReLU Activation)} \\ + \hat{Y} &= A_1 \cdot W_2^T + b_2 \quad &\text{(Output Projection)} +\end{align} + +Where: +\begin{itemize} + \item $W_1 \in \mathbb{R}^{H \times N}$ and $b_1 \in \mathbb{R}^H$ map the inputs onto the hidden vector space $H$. + \item $\max(0, \cdot)$ denotes the Non-Linear Rectified Linear Unit (ReLU) mapping algorithm. + \item $W_2 \in \mathbb{R}^{1 \times H}$ and $b_2 \in \mathbb{R}$ collapse the hidden abstraction onto the finalized regression scalar prediction $\hat{Y}$. +\end{itemize} + +\subsubsection{Architectural Configurations} +While the mathematical foundations are identical, implementations slightly differ based on dataset selections within the modules: +\begin{itemize} + \item \textbf{PyTorch Architecture:} Configures $N=13$ input features mapping to $H=64$ hidden parameters. 
    This restricts features within standard boundaries, helping to prevent exploding gradients.
+ +\subsubsection{PyTorch Inference Architecture} +Standard PyTorch Docker environments routinely eclipse several gigabytes due to CUDA bindings and generic scientific computation loops. To circumvent this inside microservices, the PyTorch inference pipeline mandates a hybrid Network File System (NFS) mapping architecture: +\begin{enumerate} + \item \textbf{NFS Mounting (\texttt{mount\_libs.sh}):} Installs an external `nfs-common` client locally and binds the extensive python library volume from an external dedicated storage server (`172.16.203.14`) into the host machine's `/mnt/LSTM-libs` map. + \item \textbf{Lightweight Container Image:} The backend \texttt{Dockerfile} avoids `pip install` commands completely, simply initializing a barebone `nvidia/cuda:12.1.1` image mapping Python $3.11$ system links. + \item \textbf{Volume Inject (\texttt{run\_container.sh}):} The script initializes the container enforcing `-v` flags that sync the NFS `/mnt/LSTM-libs` directory seamlessly onto the Docker's `/external-libs`. Crucially, it overrides the system \texttt{PYTHONPATH} to target those external `site-packages` at runtime. + \item \textbf{Execution:} The `FastAPI` instance loads, bypasses massive disk pulls, links the models iteratively, and fields inbound `HousingFeatures` lists continuously. +\end{enumerate} + +\subsubsection{Rust (Burn) Inference Architecture} +Rust handles Docker microservices inherently via statically linked deployments: +\begin{itemize} + \item \textbf{Multi-Stage Compiling:} Executes a build phase operating within an oversized `rust:1.92-alpine` chain, ejecting the resulting binary onto an isolated stripped `alpine:3.23` environment structure. + \item \textbf{Native Routing:} Utilizes \texttt{Axum} servers to establish the HTTP logic endpoints securely routing JSON payloads mapping to specific feature names (e.g. \texttt{median\_income}, \texttt{house\_age}). 
+\end{itemize} + +\subsection{Rust: Regression Model Performance Analysis} + +The regression model used in this experiment was a simple feed-forward neural network consisting of one hidden layer followed by an output layer. The model was designed to predict the median house value based on eight input features. + +\subsubsection{Model Architecture} + +The architecture of the regression model is shown below: + +\begin{lstlisting} +RegressionModel { + input_layer: Linear {d_input: 8, d_output: 64, bias: true, params: 576} + output_layer: Linear {d_input: 64, d_output: 1, bias: true, params: 65} + activation: Relu + params: 641 +} +\end{lstlisting} + +The model contains: + +\begin{itemize} + \item An input layer that maps 8 input features to 64 hidden units + \item A ReLU activation function applied after the hidden layer + \item An output layer that maps the 64 hidden units to a single scalar value +\end{itemize} + +The total number of trainable parameters in the model was only 641, making it a lightweight model suitable for fast training and inference. + +\subsubsection{Training Configuration} + +The model was trained for 100 epochs. A constant learning rate of: + +\[ +1.0 \times 10^{-3} +\] + +was used throughout the entire training process. + +\subsubsection{Training Performance} + +The training loss decreased substantially over the 100 epochs. Initially, the model started with a training loss of 3.086 during the first epoch. By the final epoch, the loss had reduced to 0.414. + +This significant reduction in loss indicates that the model successfully learned the underlying relationship between the input features and the target variable. + +\subsubsection{Validation Performance} + +Validation loss also showed a considerable improvement during training. The validation loss decreased from 4.132 in the first epoch to a minimum of 0.635 at epoch 51. 
+ +The difference between the final training loss and the minimum validation loss suggests that the model achieved good generalization performance without severe overfitting. + + + +\subsubsection{Prediction Example} + +A sample prediction generated by the model is shown below: + +\begin{lstlisting} +Predicted 2.021734 Expected 2.158 +\end{lstlisting} + +The predicted value is reasonably close to the expected value, indicating that the model was able to approximate the target variable with acceptable accuracy. + +Since the median house value was measured in units of 100,000 dollars, the prediction corresponds to: + +\begin{itemize} + \item Predicted value: approximately 202,173 dollars + \item Expected value: approximately 215,800 dollars +\end{itemize} + +\subsubsection{Predicted vs. Expected Distribution} + +The predicted-versus-expected plot suggests that the model captures the general trend in the target values, although some prediction errors remain for certain samples. + +Most of the predicted values appear concentrated around the central region of the distribution, indicating that the model performs better on common house value ranges than on extreme values. + +\subsubsection{Resource Utilization} + +The model required relatively little memory during execution. Training memory usage ranged from 2.125 GB to 2.325 GB, while validation memory usage ranged from 2.124 GB to 2.325 GB. + +CPU utilization remained moderate throughout training. Training CPU usage ranged from 19.539\% to 37.989\%, while validation CPU usage ranged from 19.550\% to 37.960\%. + +CPU temperature values were unavailable and therefore recorded as NaN. 
    \item \textbf{Generic Compile-Time Shapes:} Dimension mappings and tensor validations are fundamentally enforced inside the Rust compiler boundaries via generically typed tensor parameters (e.g., \texttt{Tensor<B, 2>}) indicating batches of distinct input structures mapping to \texttt{targets: Tensor<B, 1>}. Invalid sizes fail compilation, voiding the requirement for manual PyTorch matrix validations.
Instead, the API relies on executing an overarching `HousingBatcher` which transforms specific struct domains (\texttt{HousingDistrictItem}) safely into tensor primitives while executing implicit `self.normalizer.to\_device(device)` logic silently against constants behind boundaries.
+ \item \textbf{Record Deserialization:} States are strictly detached from models via standard `.mpk` maps. They invoke explicit \texttt{NoStdTrainingRecorder::new().load()} tracking traits unbinding memory limits inherent to standard dict serialization configurations natively loaded via `RegressionModelConfig`.
+\end{itemize}
+
+\newpage
+\subsection{Python: Regression Model Architecture and Training Performance}
+
+The regression model used in this experiment is a lightweight fully connected neural network with a small number of parameters (961 total). The model is optimized using mean squared error loss.
+
+
+
+The model was trained for 100 epochs. Training loss decreased significantly from 8265.55 to 69.86 (99.15\% reduction), while validation loss decreased from 9045.34 to 55.70.
+
+Despite strong loss reduction, the model struggled to achieve good generalization. The validation $R^2$ score remained negative (-1.07), indicating that the model performs worse than a simple baseline predictor.
+
+
+
+\textbf{Training Efficiency and Stability:}
+\begin{itemize}
+ \item Total Training Time: 47.39 seconds
+ \item Average Epoch Time: 0.225 seconds
+ \item Iteration Speed (Mean): 10.36 it/s
+ \item Gradient Norm (Mean): 7981.31
+ \item NaN Events: 0
+ \item Convergence: Non-monotonic loss decrease
+ \item Overfitting Detected: No
+\end{itemize}
+
+The results indicate that while optimization was successful in reducing loss, the model lacks sufficient capacity or feature representation to generalize well. The persistently negative validation $R^2$ suggests underfitting or a mismatch between model complexity and data characteristics.
+ + + + +\section{Task: Text Classification (AG News)} +\subsection{Model Architecture and Training Strategy} + +The text classification system is built using the \texttt{Burn} framework in Rust, leveraging a Transformer-based architecture for feature extraction and a linear classification head. This section details the mathematical formulation of the model and the strategy employed for training. + +\subsubsection{Model Architecture} +The core of the model is a Transformer Encoder, which processes a sequence of token embeddings to capture contextual relationships. The architecture consists of three primary stages: embedding, encoding, and classification. + +\paragraph{Embedding Layer} +Input text is tokenized and converted into a sequence of indices $X \in \mathbb{N}^{B \times L}$, where $B$ is the batch size and $L$ is the sequence length. The model utilizes two parallel embedding layers: +\begin{enumerate} + \item \textbf{Token Embedding ($E_{tok}$)}: Maps token indices to dense vectors of dimension $d_{model}$. + \item \textbf{Positional Embedding ($E_{pos}$)}: Maps position indices $[0, \dots, L-1]$ to dense vectors of dimension $d_{model}$ to inject sequence order information. +\end{enumerate} + +The final embedding representation $E$ is obtained by averaging the token and positional embeddings: +\begin{equation} + E = \frac{E_{tok}(X) + E_{pos}(\text{positions})}{2} +\end{equation} + +\paragraph{Transformer Encoder} +The embedding tensor $E$ is passed through a multi-layer Transformer Encoder. Each layer consists of a Multi-Head Self-Attention (MHSA) mechanism followed by a Position-wise Feed-Forward Network (FFN), with residual connections and layer normalization. 
+ +The configuration used in this implementation is as follows: +\begin{itemize} + \item \textbf{Model Dimension ($d_{model}$)}: 256 + \item \textbf{Feed-Forward Dimension ($d_{ff}$)}: 1024 + \item \textbf{Number of Heads ($N_{heads}$)}: 8 + \item \textbf{Number of Layers ($N_{layers}$)}: 4 + \item \textbf{Normalization}: Layer norm applied before sub-layers (Pre-Norm). +\end{itemize} + +Let $H = \text{TransformerEncoder}(E)$, where $H \in \mathbb{R}^{B \times L \times d_{model}}$ represents the contextualized representations of the input sequence. + +\paragraph{Classification Head} +For classification, the model utilizes the representation of the first token (typically acting as the [CLS] token) from the encoded sequence. This vector is passed through a linear layer to project it into the class space: +\begin{equation} + Y = \text{Linear}(H_{[:, 0, :]}) +\end{equation} +where $Y \in \mathbb{R}^{B \times N_{classes}}$ represents the logits. For inference, a Softmax function is applied to obtain probabilities: +\begin{equation} + \hat{P} = \text{Softmax}(Y) +\end{equation} + +\begin{table*}[t!] 
+\centering
+\caption{Model Architecture Summary}
+\label{tab:model_arch}
+\begin{tabular}{|l|l|c|c|}
+\hline
+\textbf{Component} & \textbf{Configuration / Details} & \textbf{Input Shape} & \textbf{Output Shape} \\ \hline
+Token Embedding & $V \to d_{model}$ ($V$: Vocab Size) & $(B, L)$ & $(B, L, 256)$ \\ \hline
+Pos Embedding & $L_{max} \to d_{model}$ & $(B, L)$ & $(B, L, 256)$ \\ \hline
+Embedding Merge & Average ($E_{tok} + E_{pos}$) & - & $(B, L, 256)$ \\ \hline
+Transformer Block & 4 Layers, 8 Heads, $d_{ff}=1024$ & $(B, L, 256)$ & $(B, L, 256)$ \\ \hline
+Feature Extract & Slice First Token (Index 0) & $(B, L, 256)$ & $(B, 256)$ \\ \hline
+Classifier Head & Linear ($256 \to N_{classes}$) & $(B, 256)$ & $(B, N_{classes})$ \\ \hline
+\end{tabular}
+\end{table*}
+
+
+
+\subsection{Rust Backend: Load Testing with Locust}
+
+The performance of the Rust-based backend was evaluated using the Locust load testing framework. The objective was to analyze system behavior under concurrent user load and measure key performance characteristics such as throughput and latency.
+
+\textbf{Testing Setup:}
+\begin{itemize}
+ \item Tool: Locust
+ \item Backend: Rust (HTTP service)
+ \item Test Type: Concurrent user load simulation
+ \item Environment: Linux system
+\end{itemize}
+
+\textbf{Dashboard Visualization:}
+\textbf{Full Report:}
+
+The complete load testing dashboard has been exported as a PDF and is included below for detailed inspection.
+
+\begin{figure}[H]
+\centering
+\includegraphics[width=\linewidth]{RustLocust/text.pdf}
+\caption{Rust Backend Load Testing Dashboard for Text Classification}
+\end{figure}
+
+\subsection{Python Backend: Load Testing with Locust}
+
+The performance of the Python-based backend was evaluated using the Locust load testing framework. The goal was to assess system behavior under concurrent user load and analyze key performance characteristics such as throughput and response latency.
+
+\textbf{Testing Setup:}
+\begin{itemize}
+ \item Tool: Locust
+ \item Backend: Python (HTTP service)
+ \item Test Type: Concurrent user load simulation
+ \item Environment: Linux system
+\end{itemize}
+
+\textbf{Dashboard Visualization:}
+
+\textbf{Full Report:}
+
+The complete load testing dashboard has been exported as a PDF and is included below for detailed inspection.
+
+\begin{figure}[H]
+\centering
+\includegraphics[width=\linewidth]{PythonLocust/text.pdf}
+\caption{Python Backend Load Testing Dashboard for Text Classification}
+\end{figure}
+
+\subsubsection{Training Strategy}
+The model is trained using a supervised learning approach with the following configuration:
+
+\paragraph{Loss Function}
+The training objective is to minimize the Cross-Entropy Loss between the predicted logits $Y$ and the ground truth class labels $C$:
+\begin{equation}
+ \mathcal{L} = \text{CrossEntropy}(Y, C) = -\sum_{c=1}^{N_{classes}} \mathbb{1}_{c=C} \log\left(\frac{e^{Y_c}}{\sum_{j} e^{Y_j}}\right)
+\end{equation}
+
+\paragraph{Optimization}
+We employ the \textbf{Adam} optimizer with the following parameters:
+\begin{itemize}
+ \item \textbf{Weight Decay}: $5 \times 10^{-5}$
+ \item \textbf{Beta Coefficients}: Standard defaults (typically $\beta_1=0.9, \beta_2=0.999$)
+\end{itemize}
+
+\paragraph{Learning Rate Scheduling}
+A \textbf{Noam Learning Rate Scheduler} is used to stabilize training. The learning rate increases linearly during a warmup phase and then decays proportionally to the inverse square root of the step number.
+\begin{equation} +\begin{aligned} +LR &= d_{model}^{-0.5} \cdot \min( \\ + &\quad step\_num^{-0.5}, \\ + &\quad step\_num \cdot warmup\_steps^{-1.5} +) +\end{aligned} +\end{equation} +\begin{itemize} + \item \textbf{Warmup Steps}: 1000 + \item \textbf{Base Learning Rate}: 0.01 +\end{itemize} + +\paragraph{Metrics} +During training and validation, the following metrics are tracked to monitor performance: +\begin{itemize} + \item \textbf{Loss}: Cross-Entropy Loss. + \item \textbf{Accuracy}: Percentage of correct predictions. + \item \textbf{F1-Score, Precision, Recall}: Macro-averaged metrics to account for class balance. +\end{itemize} + +\subsection{Burn Code Specifications} + +This section outlines the significant implementation details of the text classification system, focusing on the architectural choices in \texttt{model.rs} and the robust training pipeline defined in \texttt{training.rs}. +\subsubsection{Model Implementation (\texttt{model.rs})} +The \texttt{TextClassificationModel} leverages the \textbf{Burn} framework's modular design to implement a Transformer-based classifier. Key features of this implementation include: +\begin{itemize} + \item \textbf{Dual Embedding Strategy:} The model employs two distinct embedding layers: \texttt{embedding\_token} for semantic content and \texttt{embedding\_pos} for positional information. A unique characteristic of this implementation is the fusion strategy, where these embeddings are combined via averaging: + \[ + E_{final} = \frac{E_{pos} + E_{token}}{2} + \] + This differs from the standard summation approach often found in BERT implementations, potentially stabilizing the initial magnitude of the embedding vectors. + + \item \textbf{Configurable Architecture:} The system uses a \texttt{TextClassificationModelConfig} struct derived with the \texttt{Config} macro. 
This allows for type-safe and serializable hyperparameter management, ensuring the model architecture (hidden size, vocabulary size, sequence length) can be easily saved, loaded, and reproduced.
+
+ \item \textbf{Masked Attention:} The forward pass actively utilizes padding masks (\texttt{mask\_pad}). These masks are passed into the \texttt{TransformerEncoderInput}, ensuring that the self-attention mechanism strictly ignores padding tokens, which is critical for handling variable-length text sequences correctly.
+
+ \item \textbf{Separation of Train and Inference Logic:} The model explicitly implements the \texttt{TrainStep} and \texttt{InferenceStep} traits.
+ \begin{itemize}
+ \item \textbf{Training:} Returns a \texttt{ClassificationOutput} struct containing the calculated Cross-Entropy loss for backpropagation.
+ \item \textbf{Inference:} Returns raw probabilities by applying a softmax activation on the output logits, facilitating direct class prediction.
+ \end{itemize}
+\end{itemize}
+\subsubsection{Training Pipeline (\texttt{training.rs})}
+The training module is designed for reliability and comprehensive observability. It integrates advanced optimization techniques and hardware-aware monitoring.
+\begin{itemize}
+ \item \textbf{Noam Scheduler:} Transformer models are notoriously sensitive to learning rates. The code implements the \textbf{Noam Learning Rate Scheduler} (popularized by ``Attention Is All You Need''), which features a linear warmup phase (1000 steps) followed by an inverse square root decay based on the model dimension ($d_{model}$). This prevents gradient explosions during early training stages.
+
+ \item \textbf{Distributed Training Support:} The implementation explicitly handles distributed computing scenarios. It utilizes Rust's feature flags (\texttt{\#[cfg(feature = "ddp")]}) to switch between single-device training and \textbf{Distributed Data Parallel (DDP)} strategies.
When enabled, it employs a tree-based \texttt{AllReduceStrategy} for synchronizing gradients across multiple GPUs or nodes. + + \item \textbf{Comprehensive Telemetry:} The training loop is instrumented with an extensive suite of metrics beyond simple accuracy. It tracks: + \begin{itemize} + \item \textbf{Classification Metrics:} Macro-averaged F1-Score, Precision, and Recall, providing a holistic view of model performance on imbalanced datasets. + \item \textbf{Hardware Diagnostics:} CPU temperature, memory usage, and utilization are logged alongside training progress, aiding in the detection of thermal throttling or memory leaks during long training runs. + \end{itemize} + + \item \textbf{Efficient Data Sampling:} To manage large datasets efficiently, the loader utilizes a \texttt{SamplerDataset}. This limits the effective epoch size to 50,000 training samples and 5,000 validation samples, allowing for rapid iteration and feedback loops without needing to process the entire corpus in every epoch. +\end{itemize} + +\subsection{Rust: Transformer-Based Text Classification Model Performance} + +The text classification model used in this experiment was based on a Transformer encoder architecture. The model consisted of token embeddings, positional embeddings, a multi-layer Transformer encoder, and a final linear classification layer. 
+ +\subsubsection{Model Architecture} + +The architecture of the model is shown below: + +\begin{lstlisting} +TextClassificationModel { + transformer: TransformerEncoder { + d_model: 256, + d_ff: 1024, + n_heads: 8, + n_layers: 4, + dropout: 0.1, + norm_first: true, + quiet_softmax: true, + params: 3159040 + } + embedding_token: Embedding { + n_embedding: 28996, + d_model: 256, + params: 7422976 + } + embedding_pos: Embedding { + n_embedding: 256, + d_model: 256, + params: 65536 + } + output: Linear { + d_input: 256, + d_output: 4, + bias: true, + params: 1028 + } + n_classes: 4 + params: 10648580 +} +\end{lstlisting} + +The Transformer encoder used four encoder layers with eight attention heads per layer. Each layer had a model dimension of 256 and a feed-forward dimension of 1024. A dropout rate of 0.1 was used to reduce overfitting. + +The token embedding layer mapped a vocabulary of 28,996 tokens into 256-dimensional vectors. Positional embeddings of length 256 were also used so that the Transformer could capture token order information. + +The final output layer mapped the Transformer representation into four output classes. + +The total number of trainable parameters in the model was 10,648,580. + +\subsubsection{Training Configuration} + +The model was trained for a total of 5 epochs. During training, the learning rate decayed from $1.107 \times 10^{-5}$ in the first epoch to $3.733 \times 10^{-6}$ in the final epoch. + +\subsubsection{Training Performance} + +Training accuracy improved steadily from 57.968\% in the first epoch to 81.474\% in the fifth epoch. Similarly, the training loss decreased from 0.981 to 0.507. 
+ +The macro precision, recall, and F1-score also improved significantly during training: + +\begin{itemize} + \item Precision increased from 58.893\% to 81.606\% + \item Recall increased from 57.741\% to 81.334\% + \item F1-score increased from 50.201\% to 76.001\% +\end{itemize} + +These results indicate that the model learned meaningful semantic patterns in the text data over successive epochs. + +\subsubsection{Validation Performance} + +Validation performance also improved consistently. Validation accuracy increased from 72.280\% in the first epoch to a maximum of 81.640\% in the fourth epoch. + +The validation loss decreased from 0.731 to 0.507, showing that the model generalized reasonably well to unseen data. + +The validation precision, recall, and F1-score also showed strong improvement: + +\begin{itemize} + \item Precision increased from 72.230\% to 81.739\% + \item Recall increased from 72.258\% to 82.039\% + \item F1-score increased from 65.796\% to 76.509\% +\end{itemize} + +The relatively close values between training and validation accuracy suggest that the model did not suffer from severe overfitting. + +\begin{table*}[t!] +\centering +\caption{Training and Validation Metrics Summary for the Transformer-Based Text Classification Model} +% @kumar please layout sahi kar sakta hai? 
love you +\begin{tabular}{|l|l|c|c|c|c|} +\hline +\textbf{Split} & \textbf{Metric} & \textbf{Min.} & \textbf{Epoch} & \textbf{Max.} & \textbf{Epoch} \\ +\hline +Train & Accuracy & 57.968 & 1 & 81.474 & 5 \\ +Train & Loss & 0.507 & 5 & 0.981 & 1 \\ +Train & Precision@Top1 [Macro] & 58.893 & 1 & 81.606 & 5 \\ +Train & Recall@Top1 [Macro] & 57.741 & 1 & 81.334 & 5 \\ +Train & F1-Score@Top1 [Macro] & 50.201 & 1 & 76.001 & 5 \\ +Train & Learning Rate & $3.733 \times 10^{-6}$ & 5 & $1.107 \times 10^{-5}$ & 1 \\ +Train & CPU Memory (GB) & 2.401 & 2 & 2.736 & 5 \\ +Train & CPU Usage (\%) & 16.529 & 1 & 17.160 & 5 \\ +\hline +Valid & Accuracy & 72.280 & 1 & 81.640 & 4 \\ +Valid & Loss & 0.507 & 5 & 0.731 & 1 \\ +Valid & Precision@Top1 [Macro] & 72.230 & 1 & 81.739 & 5 \\ +Valid & Recall@Top1 [Macro] & 72.258 & 1 & 82.039 & 5 \\ +Valid & F1-Score@Top1 [Macro] & 65.796 & 1 & 76.509 & 5 \\ +Valid & CPU Memory (GB) & 2.263 & 1 & 2.747 & 5 \\ +Valid & CPU Usage (\%) & 20.331 & 1 & 22.190 & 3 \\ +\hline +\end{tabular} +\label{tab:transformer_text_classification_metrics} +\end{table*} + +\subsubsection{Resource Utilization} + +The CPU memory usage remained relatively stable throughout training. Training memory usage ranged from 2.401 GB to 2.736 GB, while validation memory usage ranged from 2.263 GB to 2.747 GB. + +CPU utilization also remained moderate, with training CPU usage ranging from 16.529\% to 17.160\% and validation CPU usage ranging from 20.331\% to 22.190\%. + +CPU temperature values were unavailable during the experiment and therefore recorded as NaN. + +\subsubsection{Execution Time and Failure} + +The complete training run required: + +\begin{itemize} + \item Real time: 32 minutes and 37.872 seconds + \item User CPU time: 36 minutes and 52.313 seconds + \item System CPU time: 1 minute and 41.258 seconds +\end{itemize} + +\subsection{PyTorch Training Pipeline} +This section details the Python implementation of the text classification training pipeline. 
The code mimics the architecture and logic of the Rust version to ensure comparable performance and behavior.
+\subsubsection{Code Highlights}
+\begin{itemize}
+ \item \textbf{Custom Transformer Model:}
+ The \texttt{TextClassificationModel} is a custom \texttt{nn.Module} containing:
+ \begin{itemize}
+ \item Dual embedding layers (\texttt{embedding\_token} and \texttt{embedding\_pos}).
+ \item A unique fusion strategy averaging the two embeddings: $E = (E_{pos} + E_{tok}) / 2$.
+ \item A standard \texttt{TransformerEncoder} stack.
+ \item A classification head that projects the encoded features to the 4 output classes of the AG News dataset.
+ \end{itemize}
+ \item \textbf{Noam Learning Rate Scheduler:}
+ A custom \texttt{NoamLR} scheduler is implemented to replicate the specific warmup and decay behavior used in the Rust implementation (and the original ``Attention Is All You Need'' paper).
+ \[
+ lr = \text{factor} \cdot (d_{model}^{-0.5}) \cdot \min(step^{-0.5}, step \cdot warmup^{-1.5})
+ \]
+ This ensures stable training dynamics for the Transformer architecture.
+ \item \textbf{Dataset Handling:}
+ The code utilizes the Hugging Face \texttt{datasets} library to load the ``ag\_news'' dataset. It explicitly shuffles and subsets the data (50,000 train, 5,000 test) to match the constraints applied in the Rust implementation, ensuring a fair apples-to-apples comparison between the two languages.
+ \item \textbf{Collate Function with Padding Masks:}
+ A custom \texttt{collate\_fn} handles dynamic batching. It tokenizes text using the \texttt{bert-base-cased} tokenizer and generates a boolean padding mask.
Note the inversion logic: PyTorch's \texttt{TransformerEncoder} expects \texttt{True} for padded positions (unlike some other implementations where 1 implies validity), requiring careful mask generation: + \begin{lstlisting} + mask_pad = (encoding['attention_mask'] == 0) + \end{lstlisting} + \item \textbf{Training Loop:} + The training loop is a standard PyTorch implementation using \texttt{tqdm} for progress tracking. It uses \texttt{CrossEntropyLoss} as the criterion and the \texttt{Adam} optimizer. Crucially, the scheduler step is called after every batch (not every epoch), consistent with the Noam schedule requirements. +\end{itemize} + + +\subsection{Python: Text Classification (News) Transformer Model} + +The model is a Transformer-based architecture designed for multi-class news classification. It consists of a multi-layer encoder with multi-head self-attention and feedforward networks. + +\textbf{Model Architecture:} + +\begin{lstlisting} +Model { + transformer_encoder: { + d_model: 256, + nhead: 8, + num_layers: 4, + dim_feedforward: 1024 + } + max_seq_len: 256 + num_classes: 4 + total params: 10649092 +} +\end{lstlisting} + +The model was trained for 5 epochs and showed steady convergence across all evaluation metrics. + +Training accuracy improved from 56.19\% to 79.70\%, while validation accuracy increased from 68.14\% to 79.00\%. +Training loss decreased from 1.0145 to 0.5483, and validation loss reduced from 0.8137 to 0.5628. + +The model achieved a macro F1-score of 0.7903 on the validation/test set, indicating reasonably strong classification performance for a Transformer trained over a small number of epochs. 
+ + + +\textbf{Training Efficiency and Stability:} +\begin{itemize} + \item Total Training Time: 706.99 seconds + \item Average Epoch Time: 139.83 seconds + \item Iteration Speed (Mean): 44.70 it/s + \item Gradient Norm (Mean): 15.75 + \item GPU Memory Usage: 178.79 MB + \item NaN Events: 0 + \item Convergence: Monotonic loss decrease + \item Overfitting Detected: No +\end{itemize} + +The results indicate stable training and consistent improvement across epochs. While performance is lower than simpler CNN-based tasks, this is expected due to the increased complexity of natural language understanding tasks. + + +\section{Task: LSTM implementation} + +\subsection{Introduction} +This document outlines the detailed architectural, mathematical, and translational specifics of implementing a Long Short-Term Memory (LSTM) model across two prominent machine learning environments: Rust (using the Burn framework) and Python (using PyTorch). It covers the model architecture, training pipelines, specialized deployment techniques using network filesystems (NFS) with Docker, and language-specific design implications. + +\subsection{Model Architecture and Mathematical Formulation} + +\subsubsection{Mathematical Foundation of the LSTM Cell} +The core of the model revolves around a custom, manually-implemented LSTM cell. Instead of relying on the standard un-inspectable black-box LSTM implementations provided by typical ML libraries, both codebases explicitly define the cell-level math. + +For a given timestep $t$, the input tensor $x_t$ and the previous hidden state $h_{t-1}$ are used to compute the various gates. 
The mathematical formulation utilized is: +\begin{align} + f_t &= \sigma(W_f \cdot [h_{t-1}, x_t] + b_f) \quad &\text{(Forget Gate)} \\ + i_t &= \sigma(W_i \cdot [h_{t-1}, x_t] + b_i) \quad &\text{(Input Gate)} \\ + g_t &= \tanh(W_g \cdot [h_{t-1}, x_t] + b_g) \quad &\text{(Candidate State)} \\ + o_t &= \sigma(W_o \cdot [h_{t-1}, x_t] + b_o) \quad &\text{(Output Gate)} +\end{align} +\begin{align} + c_t &= f_t \odot c_{t-1} + i_t \odot g_t \quad &\text{(New Cell State)} \\ + h_t &= o_t \odot \tanh(c_t) \quad &\text{(New Hidden State)} +\end{align} + +Where: +\begin{itemize} + \item $\sigma$ represents the Sigmoid activation function. + \item $\tanh$ represents the Hyperbolic Tangent activation function. + \item $\odot$ denotes element-wise multiplication (Hadamard product). + \item $[h_{t-1}, x_t]$ symbolizes the concatenation of the previous hidden state and the current input. +\end{itemize} + +\subsubsection{Architectural Details} +Both implementations adhere strictly to the following architectural design: +\begin{enumerate} + \item \textbf{Layer Normalization:} Pre-activation gates, the cell state ($c_t$), and the hidden state ($h_t$) pass through separate \texttt{LayerNorm} layers. This design choice stabilizes training dynamics since the feature distributions inside the LSTM evolve at every sequence step (making standard Batch Normalization ineffective). + \item \textbf{Optimized Gate Compute:} Instead of computing 4 separate linear transformations per timestep for the features, the model employs a single combined projection that outputs a $4 \times \text{hidden\_size}$ tensor. This tensor is subsequently split into four chunks corresponding to the $i, f, g$, and $o$ gates. + \item \textbf{Bidirectional Support:} An encapsulated \texttt{StackedLstm} module stacks multiple manual LSTM layers (applying dropout between layers except on the final one). 
The main \texttt{LstmNetwork} integrates a forward processing stack and an optional backward processing stack (which flips the temporal dimension of the input sequence). Their respective output hidden states are concatenated along the feature dimension before passing through a fully-connected projection head. + \item \textbf{Initialization bias:} The forget-gate bias parameters are explicitly initialized to $1.0$ (via Xavier Normal parameter slicing) to prevent fatal early-training gradient decay. +\end{enumerate} + + +\subsection{Rust Backend: Load Testing with Locust} + +The performance of the Rust-based backend was evaluated using the Locust load testing framework. The objective was to analyze system behavior under concurrent user load and measure key performance characteristics such as throughput and latency. + +\textbf{Testing Setup:} +\begin{itemize} + \item Tool: Locust + \item Backend: Rust (HTTP service) + \item Test Type: Concurrent user load simulation + \item Environment: Linux system +\end{itemize} + +\textbf{Dashboard Visualization:} +\textbf{Full Report:} + +The complete load testing dashboard has been exported as a PDF and is included below for detailed inspection. + +\begin{figure}[H] +\centering +\includegraphics[width=\linewidth]{RustLocust/LSTM.pdf} +\caption{Rust Backend Load Testing Dashboard for LSTM} +\end{figure} + +\subsection{Python Backend: Load Testing with Locust} + +The performance of the Python-based backend was evaluated using the Locust load testing framework. The goal was to assess system behavior under concurrent user load and analyze key performance characteristics such as throughput and response latency. 
+ +\textbf{Testing Setup:} +\begin{itemize} + \item Tool: Locust + \item Backend: Python (HTTP service) + \item Test Type: Concurrent user load simulation + \item Environment: Linux system +\end{itemize} + +\textbf{Dashboard Visualization:} + +\textbf{Full Report:} + +The complete load testing dashboard has been exported as a PDF and is included below for detailed inspection. + +\begin{figure}[H] +\centering +\includegraphics[width=\linewidth]{PythonLocust/LSTM.pdf} +\caption{Python Backend Load Testing Dashboard for LSTM} +\end{figure} + +\subsection{Training Pipeline} +The training behavior is intentionally synchronized to ensure parity between the languages: +\begin{itemize} + \item \textbf{Data Loading:} Operates synchronously on synthetically generated noisy sequential datasets. The validation set is scaled symmetrically relative to the training set ($20\%$ of training size). + \item \textbf{Optimization Algorithm:} Utilizes the Adam Optimizer. + \item \textbf{Loss Function:} Mean Squared Error (MSE), with reduction set to \textit{mean}. Both explicitly weigh loss accumulation during epoch passes by scaling local batch losses by the discrete batch size, averaging properly at the conclusion of the epoch. + \item \textbf{Gradient Clipping:} Ensures numerical stability on longer sequence inputs. The gradient norm is strictly clipped to $\max = 1.0$ right before the optimizer steps. + \item \textbf{Artifacts Output:} Training scripts generate an \texttt{artifact\_dir} where they store a \texttt{config.json} representation of hyperparameters, and the full state dictionary (\texttt{model.pt} in PyTorch; CompactRecorder files in Rust Burn). +\end{itemize} + +\subsection{Inference Pipeline and Docker NFS Integration} +\subsubsection{PyTorch Inference Architecture} +A critical requirement for modern PyTorch inference deployments is resolving the massive disk footprint of CUDA-enabled PyTorch backend libraries. 
The PyTorch pipeline employs a sophisticated Network File System (NFS) logic to achieve a highly optimized, lightweight Dockerized inference deployment: +\begin{enumerate} + \item \textbf{External Library Mounting:} A host-level script (\texttt{mount\_libs.sh}) maps an external NAS/NFS storage partition (from \texttt{172.16.203.14}) loaded with Python environments targeting \texttt{/mnt/LSTM-libs}. + \item \textbf{Optimized Dockerfile:} The image leverages the \texttt{nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04} base image and installs basic \texttt{python3.11} runtime headers without calling \texttt{pip install torch}. Thus, the final image size is structurally negligible compared to standard ml-images. + \item \textbf{Runtime Binding:} The inference container bootloader scripts (\texttt{run\_container.sh}) bind these volume mounts (\texttt{-v \$NFS\_MOUNT\_POINT:/external-libs}) and crucially overrides the \texttt{PYTHONPATH} env-variable: + \begin{lstlisting} + -e PYTHONPATH="$CONTAINER_LIB_MOUNT/LSTM_env/lib/.../site-packages" + \end{lstlisting} + \item \textbf{Inference Execution:} \texttt{app.py} loads the model weights off an abstracted configuration path, builds a zero-gradient loader, runs inference iteratively over a single collapsed batch, and yields predictions natively. +\end{enumerate} + +\subsubsection{Rust Inference Architecture} +Rust's inference pipeline diverges significantly regarding deployment complexity due to compilation structures: +\begin{itemize} + \item \textbf{Stateless Binaries:} No containerized runtime libraries are mandated because Burn compiles statically down to heavily optimized binaries, pulling model states directly via the \texttt{CompactRecorder}. + \item \textbf{Visualization:} Results are mapped into native Polars \texttt{DataFrame} objects (\texttt{df![]}) rendering lightweight native tables detailing \textit{expected targets} versus \textit{computed predictions}. 
+
+\end{itemize}
+
+\subsection{Implementation Specifics}
+\subsubsection{PyTorch Specific Constraints}
+\begin{itemize}
+ \item \textbf{Dynamic computation graphing:} The \texttt{model.py} cleanly slices and chunks gates natively on tensors (e.g., \texttt{gates.chunk(4, dim=1)}).
+ \item \textbf{Sequence Reversals:} Done programmatically via continuous \texttt{Tensor.flip(dims=[1])} which mandates that tensors must remain contiguously stored within PyTorch internals to avoid memory reallocation overhead.
+ \item \textbf{Seed Setting API:} Requires deterministic locking across four sub-systems (\texttt{random, numpy, torch, torch.cuda}) to match Rust's reproducibility parameters.
+\end{itemize}
+
+\subsubsection{Rust (Burn) Specific Constraints}
+\begin{itemize}
+% NOTE(review): generic parameters below were reconstructed — angle brackets were stripped in an earlier edit; confirm the exact ranks against model.rs.
+ \item \textbf{Compile-Time Dimension Types:} Rust explicitly binds Tensor dimensionality at compile time (\texttt{Tensor<B, 2>} vs \texttt{Tensor<B, 3>}). This offers unmatched safety by forbidding invalid dimension injections that PyTorch would crash on dynamically.
+ \item \textbf{Trait Encapsulation:} Leverages explicit trait architectures (\texttt{\#[derive(Module, Config)]}) that automate saving hyperparameters and generating gradient backends. Burn models must be mapped cleanly from standard states to \texttt{autodiff} states.
+ \item \textbf{No-Mutation Logic:} State mutations generated sequentially in LSTMs are represented safely utilizing explicit tuple destructuring via \texttt{LstmState\{hidden, cell\}}, bypassing complex internal pointer tracking.
+ \item \textbf{Explicit Initialization Handling:} Since Burn limits orthogonal initializers out-of-the-box, Xavier Normalization was invoked explicitly, paired with \texttt{slice\_assign} tensor mappings to safely load the 1.0 uniform fill into the forget-gate components.
+\end{itemize}
+
+\subsection{Rust: Training Loss Progression and Model Convergence}
+
+The model was trained for a total of 30 epochs.
During training, both the training loss and validation loss decreased substantially, indicating that the model was able to learn meaningful patterns from the data. + +\subsubsection{Training Progress} + +The training process began with relatively high loss values. However, as training progressed, both the average training loss and average validation loss consistently decreased. + +The recorded loss values at different stages of training are shown below: + + + +\subsubsection{Loss Trend Analysis} + +The training loss decreased from 4456.9658 at epoch 5 to 52.1122 at epoch 30. Similarly, the validation loss decreased from 4473.4448 to 17.5850 over the same period. + +This large reduction in both training and validation loss suggests that the model successfully converged during training. + +Although the training loss slightly increased between epoch 25 and epoch 30, the validation loss continued to decrease. This indicates that the model continued to improve its ability to generalize to unseen data. + +The lowest validation loss achieved during the experiment was: + +\[ +17.5850 +\] + +at epoch 30. + +\subsubsection{Generalization Performance} + +The close alignment between the training loss and validation loss throughout training suggests that the model did not suffer from severe overfitting. + +In the earlier epochs, both losses were very high, which is expected because the model parameters were still being optimized. As training continued, the losses dropped rapidly, especially between epochs 10 and 25. + +This behavior indicates that the model learned most of its predictive capability during the middle phase of training. 
+ +\subsubsection{Execution Time} + +The complete training process required: + +\begin{itemize} + \item Real time: 6 minutes and 42.185 seconds + \item User CPU time: 6 minutes and 42.414 seconds + \item System CPU time: 3.49 seconds +\end{itemize} + +The relatively low system CPU time compared to user CPU time suggests that most of the runtime was spent performing model computation rather than operating system overhead. +\subsection{Python: LSTM Model Architecture and Training Performance} + +The Long Short-Term Memory (LSTM) model consists of a 2-layer bidirectional LSTM followed by a fully connected output layer. Dropout is applied between LSTM layers to improve generalization. + +\textbf{Model Architecture:} + + +The model was trained for 30 epochs. Training and validation performance improved significantly and consistently. + +Training loss decreased from 5543.18 to 56.11 (98.99\% reduction), while validation loss decreased from 5699.26 to 48.92. +The model achieved strong regression performance, with validation RMSE reaching 6.99 and MAE reaching 4.33. + +The $R^2$ score improved from negative values to 0.9517, indicating strong predictive capability. + + + +\textbf{Training Efficiency and Stability:} +\begin{itemize} + \item Total Training Time: 158.63 seconds + \item Average Epoch Time: 5.18 seconds + \item Iteration Speed (Mean): 6.22 it/s + \item Gradient Norm (Mean): 1394.59 + \item NaN Events: 0 + \item Convergence: Monotonic loss decrease + \item Overfitting Detected: No +\end{itemize} + + + + +\section{Overall Evaluation Summary} + +\subsection{Container Comparison: Python vs Rust} + +\begin{table*}[t!] 
+\centering +\begin{tabular}{|l|c|c|} +\hline +\textbf{Feature} & \textbf{Python (GPU)} & \textbf{Rust (WGPU)} \\ +\hline +Image Name & text\_classification\_image & text\_class\_rs \\ +\hline +Size & 3.98GB & 974MB \\ +\hline +Backend & PyTorch + CUDA & Native Rust (WGPU) \\ +\hline +Startup Time & Slower & Faster \\ +\hline +Dependencies & Heavy (Torch, CUDA, Python) & Minimal (compiled binary) \\ +\hline +Deployment Complexity & Higher & Lower \\ +\hline +Flexibility & High (research-friendly) & Moderate \\ +\hline +Runtime Stability & Medium & High \\ +\hline +\end{tabular} +\caption{Comparison of Python GPU-based and Rust-based inference containers} +\end{table*} + +\noindent +The Python-based container provides flexibility and rapid experimentation using the PyTorch ecosystem, but at the cost of larger image size and dependency complexity. In contrast, the Rust-based container offers a lightweight, production-ready solution with faster startup time and minimal runtime dependencies, making it more suitable for deployment scenarios. + + +\subsection{Model Size Comparison} + +\begin{table*}[t!] +\centering +\begin{tabular}{|l|c|c|} +\hline +\textbf{Model} & \textbf{Rust (.mpk/.bin)} & \textbf{Python (.pt/.pth)} \\ +\hline +MNIST & 1.1 MB & 2 MB \\ +\hline +Text Classification (AG News) & 21 MB & 40.6 MB \\ +\hline +Regression & 4 KB & 6 MB \\ +\hline +LSTM & 60 KB & 120 KB \\ +\hline +\end{tabular} +\caption{Comparison of model sizes between Rust and Python implementations} +\end{table*} + +\noindent +Rust-based serialized models are consistently smaller than their Python counterparts. This reduction is most significant in simpler models such as regression, and remains substantial for larger models like text classification. The smaller footprint of Rust models makes them more suitable for lightweight deployment and resource-constrained environments. 
+ +\EOD +\end{document} \ No newline at end of file diff --git a/latex_reports/lstm.tex b/latex_reports/lstm.tex new file mode 100644 index 0000000..fcf3814 --- /dev/null +++ b/latex_reports/lstm.tex @@ -0,0 +1,326 @@ +\documentclass[12pt]{article} +\usepackage{amsmath} +\usepackage{amssymb} +\usepackage{graphicx} +\usepackage{hyperref} +\usepackage{minted} +\usepackage{geometry} +\geometry{a4paper, margin=1in} +\usepackage{pdfpages} + + +\title{\textbf{Comparative Analysis of LSTM Implementation: Rust (Burn) vs. PyTorch}} +\author{Technical Report} +\date{\today} + +\begin{document} + +\maketitle +\tableofcontents +\newpage + +\section{Introduction} +This document outlines the detailed architectural, mathematical, and translational specifics of implementing a Long Short-Term Memory (LSTM) model across two prominent machine learning environments: Rust (using the Burn framework) and Python (using PyTorch). It covers the model architecture, training pipelines, specialized deployment techniques using network filesystems (NFS) with Docker, and language-specific design implications. + +\section{Model Architecture and Mathematical Formulation} + +\subsection{Mathematical Foundation of the LSTM Cell} +The core of the model revolves around a custom, manually-implemented LSTM cell. Instead of relying on the standard un-inspectable black-box LSTM implementations provided by typical ML libraries, both codebases explicitly define the cell-level math. + +For a given timestep $t$, the input tensor $x_t$ and the previous hidden state $h_{t-1}$ are used to compute the various gates. 
The mathematical formulation utilized is: +\begin{align} + f_t &= \sigma(W_f \cdot [h_{t-1}, x_t] + b_f) \quad &\text{(Forget Gate)} \\ + i_t &= \sigma(W_i \cdot [h_{t-1}, x_t] + b_i) \quad &\text{(Input Gate)} \\ + g_t &= \tanh(W_g \cdot [h_{t-1}, x_t] + b_g) \quad &\text{(Candidate State)} \\ + o_t &= \sigma(W_o \cdot [h_{t-1}, x_t] + b_o) \quad &\text{(Output Gate)} +\end{align} +\begin{align} + c_t &= f_t \odot c_{t-1} + i_t \odot g_t \quad &\text{(New Cell State)} \\ + h_t &= o_t \odot \tanh(c_t) \quad &\text{(New Hidden State)} +\end{align} + +Where: +\begin{itemize} + \item $\sigma$ represents the Sigmoid activation function. + \item $\tanh$ represents the Hyperbolic Tangent activation function. + \item $\odot$ denotes element-wise multiplication (Hadamard product). + \item $[h_{t-1}, x_t]$ symbolizes the concatenation of the previous hidden state and the current input. +\end{itemize} + +\subsection{Architectural Details} +Both implementations adhere strictly to the following architectural design: +\begin{enumerate} + \item \textbf{Layer Normalization:} Pre-activation gates, the cell state ($c_t$), and the hidden state ($h_t$) pass through separate \texttt{LayerNorm} layers. This design choice stabilizes training dynamics since the feature distributions inside the LSTM evolve at every sequence step (making standard Batch Normalization ineffective). + \item \textbf{Optimized Gate Compute:} Instead of computing 4 separate linear transformations per timestep for the features, the model employs a single combined projection that outputs a $4 \times \text{hidden\_size}$ tensor. This tensor is subsequently split into four chunks corresponding to the $i, f, g$, and $o$ gates. + \item \textbf{Bidirectional Support:} An encapsulated \texttt{StackedLstm} module stacks multiple manual LSTM layers (applying dropout between layers except on the final one). 
The main \texttt{LstmNetwork} integrates a forward processing stack and an optional backward processing stack (which flips the temporal dimension of the input sequence). Their respective output hidden states are concatenated along the feature dimension before passing through a fully-connected projection head. + \item \textbf{Initialization bias:} The forget-gate bias parameters are explicitly initialized to $1.0$ (via Xavier Normal parameter slicing) to prevent fatal early-training gradient decay. +\end{enumerate} + + +\section{Rust Backend: Load Testing with Locust} + +The performance of the Rust-based backend was evaluated using the Locust load testing framework. The objective was to analyze system behavior under concurrent user load and measure key performance characteristics such as throughput and latency. + +\textbf{Testing Setup:} +\begin{itemize} + \item Tool: Locust + \item Backend: Rust (HTTP service) + \item Test Type: Concurrent user load simulation + \item Environment: Linux system +\end{itemize} + +\textbf{Dashboard Visualization:} +\textbf{Full Report:} + +The complete load testing dashboard has been exported as a PDF and is included below for detailed inspection. + +\includepdf[pages=-]{RustLocust/LSTM.pdf} + + + +\section{Python Backend: Load Testing with Locust} + +The performance of the Python-based backend was evaluated using the Locust load testing framework. The goal was to assess system behavior under concurrent user load and analyze key performance characteristics such as throughput and response latency. + +\textbf{Testing Setup:} +\begin{itemize} + \item Tool: Locust + \item Backend: Python (HTTP service) + \item Test Type: Concurrent user load simulation + \item Environment: Linux system +\end{itemize} + +\textbf{Dashboard Visualization:} + +\textbf{Full Report:} + +The complete load testing dashboard has been exported as a PDF and is included below for detailed inspection. 
+ +\includepdf[pages=-]{PythonLocust/LSTM.pdf} + +\section{Training Pipeline} +The training behavior is intentionally synchronized to ensure parity between the languages: +\begin{itemize} + \item \textbf{Data Loading:} Operates synchronously on synthetically generated noisy sequential datasets. The validation set is scaled symmetrically relative to the training set ($20\%$ of training size). + \item \textbf{Optimization Algorithm:} Utilizes the Adam Optimizer. + \item \textbf{Loss Function:} Mean Squared Error (MSE), with reduction set to \textit{mean}. Both explicitly weigh loss accumulation during epoch passes by scaling local batch losses by the discrete batch size, averaging properly at the conclusion of the epoch. + \item \textbf{Gradient Clipping:} Ensures numerical stability on longer sequence inputs. The gradient norm is strictly clipped to $\max = 1.0$ right before the optimizer steps. + \item \textbf{Artifacts Output:} Training scripts generate an \texttt{artifact\_dir} where they store a \texttt{config.json} representation of hyperparameters, and the full state dictionary (\texttt{model.pt} in PyTorch; CompactRecorder files in Rust Burn). +\end{itemize} + +\section{Inference Pipeline and Docker NFS Integration} +\subsection{PyTorch Inference Architecture} +A critical requirement for modern PyTorch inference deployments is resolving the massive disk footprint of CUDA-enabled PyTorch backend libraries. The PyTorch pipeline employs a sophisticated Network File System (NFS) logic to achieve a highly optimized, lightweight Dockerized inference deployment: +\begin{enumerate} + \item \textbf{External Library Mounting:} A host-level script (\texttt{mount\_libs.sh}) maps an external NAS/NFS storage partition (from \texttt{172.16.203.14}) loaded with Python environments targeting \texttt{/mnt/LSTM-libs}. 
+ \item \textbf{Optimized Dockerfile:} The image leverages the \texttt{nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04} base image and installs basic \texttt{python3.11} runtime headers without calling \texttt{pip install torch}. Thus, the final image size is structurally negligible compared to standard ml-images. + \item \textbf{Runtime Binding:} The inference container bootloader scripts (\texttt{run\_container.sh}) bind these volume mounts (\texttt{-v \$NFS\_MOUNT\_POINT:/external-libs}) and crucially overrides the \texttt{PYTHONPATH} env-variable: + \begin{verbatim} + -e PYTHONPATH="$CONTAINER_LIB_MOUNT/LSTM_env/lib/.../site-packages" + \end{verbatim} + \item \textbf{Inference Execution:} \texttt{app.py} loads the model weights off an abstracted configuration path, builds a zero-gradient loader, runs inference iteratively over a single collapsed batch, and yields predictions natively. +\end{enumerate} + +\subsection{Rust Inference Architecture} +Rust's inference pipeline diverges significantly regarding deployment complexity due to compilation structures: +\begin{itemize} + \item \textbf{Stateless Binaries:} No containerized runtime libraries are mandated because Burn compiles statically down to heavily optimized binaries, pulling model states directly via the \texttt{CompactRecorder}. + \item \textbf{Visualization:} Results are mapped into native Polars \texttt{DataFrame} objects (\texttt{df![]}) rendering lightweight native tables detailing \textit{expected targets} versus \textit{computed predictions}. +\end{itemize} + +\section{Implementation Specifics} +\subsection{PyTorch Specific Constraints} +\begin{itemize} + \item \textbf{Dynamic computation graphing:} The \texttt{model.py} cleanly slices and chunks gates natively on tensors (e.g., \texttt{gates.chunk(4, dim=1)}). 
+ \item \textbf{Sequence Reversals:} Done programmatically via continuous \texttt{Tensor.flip(dims=[1])} which mandates that tensors must remain contiguously stored within PyTorch internals to avoid memory reallocation overhead. + \item \textbf{Seed Setting API:} Requires deterministic locking across four sub-systems (\texttt{random, numpy, torch, torch.cuda}) to match Rust's reproducibility parameters. +\end{itemize} + +\subsection{Rust (Burn) Specific Constraints} +\begin{itemize} + \item \textbf{Compile-Time Dimension Types:} Rust explicitly binds Tensor dimensionality at compile time (\texttt{Tensor} vs \texttt{Tensor}). This offers un-matched safety by forbidding invalid dimension injections that PyTorch would crash on dynamically. + \item \textbf{Trait Encapsulation:} Leverages explicit trait architectures (\texttt{\#[derive(Module, Config)]}) that automate saving hyperparameters and generating gradient backends. Burn models must be mapped cleanly from standard states to \texttt{autodiff} states. + \item \textbf{No-Mutation Logic:} State mutations generated sequentially in LSTMs are represented safely utilizing explicit tuple destructuring via \texttt{LstmState\{hidden, cell\}}, bypassing complex internal pointer tracking. + \item \textbf{Explicit Initialization Handling:} Since Burn limits orthogonal initializes out-of-the-box, Xavier Normalization was invoked explicitly, paired with \texttt{slice\_assign} tensor mappings to safely load the 1.0 uniform fill into the forget-gate components. +\end{itemize} + +\section{Rust: Training Loss Progression and Model Convergence} + +The model was trained for a total of 30 epochs. During training, both the training loss and validation loss decreased substantially, indicating that the model was able to learn meaningful patterns from the data. + +\subsection{Training Progress} + +The training process began with relatively high loss values. 
However, as training progressed, both the average training loss and average validation loss consistently decreased. + +The recorded loss values at different stages of training are shown below: + +\begin{table}[h] +\centering +\caption{Training and Validation Loss Progression} +\begin{tabular}{|c|c|c|} +\hline +\textbf{Epoch} & \textbf{Average Training Loss} & \textbf{Average Validation Loss} \\ +\hline +5 & 4456.9658 & 4473.4448 \\ +10 & 2510.1016 & 2438.3970 \\ +15 & 900.7457 & 801.6573 \\ +20 & 154.4127 & 164.8311 \\ +25 & 48.2149 & 20.1441 \\ +30 & 52.1122 & 17.5850 \\ +\hline +\end{tabular} +\label{tab:loss_progression} +\end{table} + +\subsection{Loss Trend Analysis} + +The training loss decreased from 4456.9658 at epoch 5 to 52.1122 at epoch 30. Similarly, the validation loss decreased from 4473.4448 to 17.5850 over the same period. + +This large reduction in both training and validation loss suggests that the model successfully converged during training. + +Although the training loss slightly increased between epoch 25 and epoch 30, the validation loss continued to decrease. This indicates that the model continued to improve its ability to generalize to unseen data. + +The lowest validation loss achieved during the experiment was: + +\[ +17.5850 +\] + +at epoch 30. + +\subsection{Generalization Performance} + +The close alignment between the training loss and validation loss throughout training suggests that the model did not suffer from severe overfitting. + +In the earlier epochs, both losses were very high, which is expected because the model parameters were still being optimized. As training continued, the losses dropped rapidly, especially between epochs 10 and 25. + +This behavior indicates that the model learned most of its predictive capability during the middle phase of training. 
+ +\subsection{Execution Time} + +The complete training process required: + +\begin{itemize} + \item Real time: 6 minutes and 42.185 seconds + \item User CPU time: 6 minutes and 42.414 seconds + \item System CPU time: 3.49 seconds +\end{itemize} + +The relatively low system CPU time compared to user CPU time suggests that most of the runtime was spent performing model computation rather than operating system overhead. +\section{Python: LSTM Model Architecture and Training Performance} + +The Long Short-Term Memory (LSTM) model consists of a 2-layer bidirectional LSTM followed by a fully connected output layer. Dropout is applied between LSTM layers to improve generalization. + +\textbf{Model Architecture:} + + +The model was trained for 30 epochs. Training and validation performance improved significantly and consistently. + +Training loss decreased from 5543.18 to 56.11 (98.99\% reduction), while validation loss decreased from 5699.26 to 48.92. +The model achieved strong regression performance, with validation RMSE reaching 6.99 and MAE reaching 4.33. + +The $R^2$ score improved from negative values to 0.9517, indicating strong predictive capability. 
+ +\begin{table}[h] +\centering +\begin{tabular}{|l|l|c|c|c|c|} +\hline +\textbf{Split} & \textbf{Metric} & \textbf{Min} & \textbf{Epoch} & \textbf{Max} & \textbf{Epoch} \\ +\hline + +Train & Loss & 56.11 & 30 & 5543.18 & 1 \\ +Train & RMSE & 7.49 & 30 & 74.45 & 1 \\ +Train & MAE & 5.03 & 30 & 67.25 & 1 \\ +Train & R$^2$ & -4.41 & 1 & 0.9452 & 30 \\ +Train & Grad Norm (Total) & 524.61 & 1 & 4972.41 & 24 \\ +Train & Iteration Speed (it/s) & 5.02 & 29 & 6.94 & 4 \\ +Train & CPU Memory (GB) & 0.87 & -- & 1.13 & -- \\ +Train & CPU Usage (\%) & 72.4 & -- & 88.8 & -- \\ +\hline + +Valid & Loss & 47.96 & 29 & 5699.26 & 1 \\ +Valid & RMSE & 6.93 & 29 & 75.49 & 1 \\ +Valid & MAE & 3.54 & 28 & 68.51 & 1 \\ +Valid & R$^2$ & -4.63 & 1 & 0.9526 & 29 \\ +Valid & Iteration Speed (it/s) & 5.02 & 29 & 6.94 & 4 \\ +Valid & CPU Memory (GB) & 0.87 & -- & 1.13 & -- \\ +Valid & CPU Usage (\%) & 72.4 & -- & 88.8 & -- \\ +\hline + +\end{tabular} +\caption{Extended LSTM Training and Validation Metrics Summary} +\end{table} + +\textbf{Training Efficiency and Stability:} +\begin{itemize} + \item Total Training Time: 158.63 seconds + \item Average Epoch Time: 5.18 seconds + \item Iteration Speed (Mean): 6.22 it/s + \item Gradient Norm (Mean): 1394.59 + \item NaN Events: 0 + \item Convergence: Monotonic loss decrease + \item Overfitting Detected: No +\end{itemize} + + + + +\newpage + +\section{Container Comparison: Python vs Rust} + +\begin{table}[h] +\centering +\begin{tabular}{|l|c|c|} +\hline +\textbf{Feature} & \textbf{Python (GPU)} & \textbf{Rust (WGPU)} \\ +\hline +Image Name & text\_classification\_image & text\_class\_rs \\ +\hline +Size & 3.98GB & 974MB \\ +\hline +Backend & PyTorch + CUDA & Native Rust (WGPU) \\ +\hline +Startup Time & Slower & Faster \\ +\hline +Dependencies & Heavy (Torch, CUDA, Python) & Minimal (compiled binary) \\ +\hline +Deployment Complexity & Higher & Lower \\ +\hline +Flexibility & High (research-friendly) & Moderate \\ +\hline +Runtime Stability & Medium & 
High \\ +\hline +\end{tabular} +\caption{Comparison of Python GPU-based and Rust-based inference containers} +\end{table} + +\noindent +The Python-based container provides flexibility and rapid experimentation using the PyTorch ecosystem, but at the cost of larger image size and dependency complexity. In contrast, the Rust-based container offers a lightweight, production-ready solution with faster startup time and minimal runtime dependencies, making it more suitable for deployment scenarios. + + +\section{Model Size Comparison} + +\begin{table}[h] +\centering +\begin{tabular}{|l|c|c|} +\hline +\textbf{Model} & \textbf{Rust (.mpk/.bin)} & \textbf{Python (.pt/.pth)} \\ +\hline +MNIST & 1.1 MB & 2 MB \\ +\hline +Text Classification (AG News) & 21 MB & 40.6 MB \\ +\hline +Regression & 4 KB & 6 MB \\ +\hline +LSTM & 60 KB & 120 KB \\ +\hline +\end{tabular} +\caption{Comparison of model sizes between Rust and Python implementations} +\end{table} + +\noindent +Rust-based serialized models are consistently smaller than their Python counterparts. This reduction is most significant in simpler models such as regression, and remains substantial for larger models like text classification. The smaller footprint of Rust models makes them more suitable for lightweight deployment and resource-constrained environments. 
+ + +\end{document} diff --git a/latex_reports/main.tex b/latex_reports/main.tex new file mode 100644 index 0000000..1c10156 --- /dev/null +++ b/latex_reports/main.tex @@ -0,0 +1,227 @@ +\documentclass[11pt,a4paper]{article} + +\usepackage{geometry} +\geometry{margin=1in} + +\usepackage{amsmath,amssymb} +\usepackage{graphicx} +\usepackage{booktabs} +\usepackage{hyperref} +\usepackage{enumitem} +\usepackage{array} +\usepackage{float} + +\title{\textbf{Progress Report: System-Level Evaluation of Rust and Python for Machine Learning}} +\author{Project Elective} +\date{\today} + +\begin{document} + +\maketitle + +\section{Overview of the Project} + +This project studies the use of \textbf{Rust} as an alternative systems language for machine learning workflows traditionally implemented in \textbf{Python}. +Rather than focusing on state-of-the-art model performance, the emphasis is on: + +\begin{itemize} + \item feasibility of end-to-end ML workflows, + \item system stability and reproducibility, + \item developer experience and DevOps complexity, + \item deployment and operational characteristics. +\end{itemize} + +To ensure clarity and rigor, the work is organized into \textbf{two clearly separated experimental tracks}. + +--- + +\section{Project Structure: Two-Track Evaluation} + +The project consists of the following two tracks: + +\subsection*{Track 1: Training-Based Systems Evaluation} +This track compares \textbf{machine learning training pipelines} implemented in: +\begin{itemize} + \item PyTorch (Python), and + \item Burn (Rust). +\end{itemize} + +The goal is to evaluate training feasibility, stability, compile-time guarantees, and DevOps impact, rather than raw training speed. + +\subsection*{Track 2: Inference-Based DevOps Evaluation} +This track compares \textbf{production-style inference services} implemented in: +\begin{itemize} + \item Python-based ONNX inference, and + \item Rust-based ONNX inference. 
+\end{itemize} + +The focus is on deployment, security, containerization, CI/CD behavior, and runtime efficiency. + +Each track is designed to answer a distinct research question while remaining complementary. + +--- + +\section{Machine Learning Tasks Considered} + +To ensure coverage of diverse ML workloads, the following tasks are identified: + +\begin{itemize} + \item \textbf{Text Classification}: Dataset to be finalized. + \item \textbf{Image Classification}: MNIST dataset. + \item \textbf{Credit Score Assignment}: Supervised classification task. + \item \textbf{Multi-Objective Machine Learning}: Brain Tumor dataset with a MOML formulation. + \item \textbf{Fine-Tuning Task}: BERT-based classification (ANLP Assignment 1), with optional LoRA / QLoRA. + \item \textbf{Autoregressive Decoding}: Experiments using the Burn framework. +\end{itemize} + +At the current stage, the \textbf{MNIST image classification task has been fully implemented}. +The corresponding training code is available in the project GitHub repository. 
+ +--- + +\section{Related Work} + +The following research papers are being used to guide experimental design and evaluation: + +\begin{itemize} + \item \url{https://ieeexplore.ieee.org/document/11126113} + \item \url{https://ieeexplore.ieee.org/document/11261485} + \item \url{https://ieeexplore.ieee.org/document/11212348} + \item \url{https://www.ijsred.com/volume8/issue2/IJSRED-V8I2P143.pdf} +\end{itemize} + +--- + +\section{Code Repository and Current Status} + +Project repository: +\begin{center} +\url{https://github.com/Abhinav-Kumar012/Rust_Python_ML_PE.git} +\end{center} + +Current progress includes: +\begin{itemize} + \item MNIST training pipeline implemented + \item PyTorch baseline established + \item Initial Rust (Burn) training setup completed +\end{itemize} + +--- + +\section{Track 1: Training-Based Systems Evaluation} + +\subsection{Objective} + +The objective of this track is to answer the following research question: + +\begin{quote} +\textit{Can Rust realistically support end-to-end machine learning training pipelines, and what system-level trade-offs does this introduce compared to PyTorch?} +\end{quote} + +This track explicitly avoids speed-centric benchmarking and instead focuses on system behavior. 
+ +--- + +\subsection{Frameworks Compared} + +\subsubsection{PyTorch (Baseline)} +\begin{itemize} + \item Language: Python + \item Training maturity: Very high + \item Ecosystem: Extensive +\end{itemize} + +\subsubsection{Rust (Burn)} +\begin{itemize} + \item Language: Rust + \item Training maturity: Emerging + \item Design: Idiomatic Rust, native training support +\end{itemize} + +--- + +\subsection{Experimental Controls} + +\textbf{Fixed Across Both Implementations} +\begin{itemize} + \item Dataset splits + \item Number of epochs + \item Batch size + \item Optimizer type + \item Learning rate + \item Hardware +\end{itemize} + +\textbf{Allowed Differences} +\begin{itemize} + \item Internal kernel implementations + \item Graph execution model + \item Memory management +\end{itemize} + +--- + +\subsection{Metrics Collected} + +\begin{itemize} + \item Training time per epoch (reported cautiously) + \item Loss curves and convergence behavior + \item Runtime failures and numerical stability + \item Reproducibility across runs + \item Environment setup and build complexity + \item Dependency footprint and artifact size +\end{itemize} + +--- + +\section{Track 2: Inference-Based DevOps Evaluation} + +\subsection{Objective} + +The objective of this track is to compare \textbf{deployment, security, and operational characteristics} of Python-based and Rust-based ML inference services executing the same ONNX model. + +--- + +\subsection{Inference Services Compared} + +\textbf{Python Service} +\begin{itemize} + \item FastAPI + Uvicorn + \item ONNX Runtime (Python) +\end{itemize} + +\textbf{Rust Service} +\begin{itemize} + \item Axum / Actix + \item burn-rs +\end{itemize} + +Both services expose identical inference endpoints and return identical outputs. 
import os
import re

# NOTE(review): hardcoded absolute path to the author's Windows machine — the
# script only produces output when run from that environment. Consider deriving
# the directory from __file__; kept as-is to preserve existing behavior.
directory = r"c:\Users\Valmik Belgaonkar\OneDrive\Desktop\Rust_Python_ML_PE\latex_reports"

# Source documents to merge, keyed by a short task id.
files = {
    'main': os.path.join(directory, 'main.tex'),
    'mnist': os.path.join(directory, 'mnist.tex'),
    'regression': os.path.join(directory, 'regression.tex'),
    'text': os.path.join(directory, 'text_classification_news.tex'),
    'lstm': os.path.join(directory, 'lstm.tex'),
}

out_file = os.path.join(directory, 'combined_research_paper.tex')


def extract_body(file_path):
    r"""Return the LaTeX body of ``file_path``.

    The body is the text between ``\begin{document}`` and ``\end{document}``,
    stripped of ``\maketitle`` and ``\tableofcontents`` (the combined paper has
    a unified front matter). For every file except ``main.tex`` the sectioning
    commands are demoted one level so they nest under a per-task ``\section``.

    Returns "" when the file is missing or has no document environment.
    """
    if not os.path.exists(file_path):
        print(f"Warning: {file_path} not found.")
        return ""

    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Extract everything between \begin{document} and \end{document}.
    match = re.search(r'\\begin\{document\}(.*?)\\end\{document\}', content, re.DOTALL)
    if match is None:
        return ""

    body = match.group(1)
    # Remove per-document front matter; the combined paper provides its own.
    body = re.sub(r'\\maketitle', '', body)
    body = re.sub(r'\\tableofcontents', '', body)

    # Demote sections one level, deepest first so nothing is demoted twice.
    # main.tex keeps its top-level structure.
    if 'main.tex' not in file_path:
        for old, new in (
            ('\\subsubsection{', '\\paragraph{'),
            ('\\subsection{', '\\subsubsection{'),
            ('\\section{', '\\subsection{'),
            # Starred variants such as \section*{...}.
            ('\\subsubsection*{', '\\paragraph*{'),
            ('\\subsection*{', '\\subsubsection*{'),
            ('\\section*{', '\\subsection*{'),
        ):
            body = body.replace(old, new)

    return body.strip()


def process_tables_for_twocolumn(body):
    r"""Rewrite ``table`` environments as ``table*`` so that wide tables span
    both columns of the two-column combined layout.

    Any existing placement specifier is normalised to ``[t!]``; bare
    ``\begin{table}`` also gains ``[t!]``.
    """
    body = re.sub(r'\\begin\{table\}\[.*?\]', r'\\begin{table*}[t!]', body)
    body = re.sub(r'\\begin\{table\}', r'\\begin{table*}[t!]', body)
    body = re.sub(r'\\end\{table\}', r'\\end{table*}', body)
    return body


# Preamble of the combined two-column paper, written verbatim to the output.
preamble = r"""\documentclass[10pt,a4paper,twocolumn]{article}

\usepackage{geometry}
\geometry{margin=0.75in, columnsep=0.25in}

\usepackage{amsmath,amssymb}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{hyperref}
\usepackage{enumitem}
\usepackage{array}
\usepackage{float}
\usepackage{listings}
\usepackage{xcolor}
\usepackage{pdfpages}
\usepackage{minted}

\lstdefinelanguage{Dockerfile}{
    keywords={FROM, RUN, CMD, LABEL, MAINTAINER, EXPOSE, ENV, ADD, COPY, ENTRYPOINT, VOLUME, USER, WORKDIR, ARG, ONBUILD, STOPSIGNAL, HEALTHCHECK, SHELL},
    sensitive=true,
    comment=[l]{\#},
    morestring=[b]",
}

\title{\textbf{System-Level Evaluation of Rust and Python for Machine Learning}}
\author{Project Elective}
\date{\today}

\begin{document}

\maketitle
"""

postamble = r"""
\end{document}
"""


def main():
    """Extract, demote, table-fix and concatenate all reports into one file."""
    print("Extracting contents...")
    # Two-column layout: wide tables must become table* in every body.
    bodies = {
        key: process_tables_for_twocolumn(extract_body(path))
        for key, path in files.items()
    }

    # main.tex goes first with no extra heading; each task report is wrapped
    # in its own top-level \section.
    sections = [
        (None, 'main'),
        (r"\section{Task: MNIST Image Classification}", 'mnist'),
        (r"\section{Task: Regression}", 'regression'),
        (r"\section{Task: Text Classification (AG News)}", 'text'),
        (r"\section{Task: LSTM implementation}", 'lstm'),
    ]

    print(f"Writing combined file to {out_file}...")
    with open(out_file, 'w', encoding='utf-8') as f:
        f.write(preamble)
        for heading, key in sections:
            if heading is not None:
                f.write(heading + "\n")
            f.write(bodies[key])
            f.write("\n\n\\clearpage\n\n")
        f.write(postamble)

    print("Combined file created successfully!")


# Guard so that importing this module (e.g. for testing) does not run the merge.
if __name__ == "__main__":
    main()
a/latex_reports/mnist.tex b/latex_reports/mnist.tex new file mode 100644 index 0000000..79b9052 --- /dev/null +++ b/latex_reports/mnist.tex @@ -0,0 +1,685 @@ +\documentclass[11pt,a4paper]{article} + +\usepackage{geometry} +\geometry{margin=1in} + +\usepackage{amsmath,amssymb} +\usepackage{graphicx} +\usepackage{booktabs} +\usepackage{hyperref} +\usepackage{enumitem} +\usepackage{array} +\usepackage{float} +\usepackage{listings} +\usepackage{xcolor} +\usepackage{pdfpages} + + +\lstdefinelanguage{Dockerfile}{ + keywords={FROM, RUN, CMD, LABEL, MAINTAINER, EXPOSE, ENV, ADD, COPY, ENTRYPOINT, VOLUME, USER, WORKDIR, ARG, ONBUILD, STOPSIGNAL, HEALTHCHECK, SHELL}, + sensitive=true, + comment=[l]{\#}, + morestring=[b]", +} + +\title{\textbf{MNIST}} +\date{\today} + +\begin{document} + +\maketitle + +\section{Architecture Details} + +\begin{table}[h!] +\centering +\renewcommand{\arraystretch}{1.3} +\begin{tabular}{|c|l|l|l|l|} +\hline +\textbf{Step} & \textbf{Layer} & \textbf{Configuration} & \textbf{Input Shape} & \textbf{Output Shape} \\ +\hline + +1 & Input & Grayscale Images & +$[B, H, W]$ & +$[B, H, W]$ \\ + +\hline +2 & Reshape & Add channel dimension & +$[B, H, W]$ & +$[B, 1, H, W]$ \\ + +\hline +3 & Conv2D (conv1) & +$1 \rightarrow 8$, kernel $3 \times 3$ & +$[B, 1, H, W]$ & +$[B, 8, H-2, W-2]$ \\ + +\hline +4 & Dropout & +$p = 0.5$ & +$[B, 8, H-2, W-2]$ & +$[B, 8, H-2, W-2]$ \\ + +\hline +5 & Conv2D (conv2) & +$8 \rightarrow 16$, kernel $3 \times 3$ & +$[B, 8, H-2, W-2]$ & +$[B, 16, H-4, W-4]$ \\ + +\hline +6 & Dropout & +$p = 0.5$ & +$[B, 16, H-4, W-4]$ & +$[B, 16, H-4, W-4]$ \\ + +\hline +7 & ReLU & +Activation & +$[B, 16, H-4, W-4]$ & +$[B, 16, H-4, W-4]$ \\ + +\hline +8 & Adaptive Avg Pool & +Output size $8 \times 8$ & +$[B, 16, H-4, W-4]$ & +$[B, 16, 8, 8]$ \\ + +\hline +9 & Flatten & +$16 \times 8 \times 8$ & +$[B, 16, 8, 8]$ & +$[B, 1024]$ \\ + +\hline +10 & Linear (fc1) & +$1024 \rightarrow \texttt{hidden\_size}$ & +$[B, 1024]$ & +$[B, 
\texttt{hidden\_size}]$ \\
+
+\hline
+11 & Dropout &
+$p = 0.5$ &
+$[B, \texttt{hidden\_size}]$ &
+$[B, \texttt{hidden\_size}]$ \\
+
+\hline
+12 & ReLU &
+Activation &
+$[B, \texttt{hidden\_size}]$ &
+$[B, \texttt{hidden\_size}]$ \\
+
+\hline
+13 & Linear (fc2) &
+$\texttt{hidden\_size} \rightarrow \texttt{num\_classes}$ &
+$[B, \texttt{hidden\_size}]$ &
+$[B, \texttt{num\_classes}]$ \\
+
+\hline
+\end{tabular}
+\caption{Detailed architecture of the convolutional neural network implemented in Burn.
+$B$ denotes batch size, $H$ and $W$ denote input image height and width respectively.}
+\label{tab:burn-cnn-architecture}
+\end{table}
+
+\noindent\textbf{Notes:}
+\begin{itemize}
+    \item All convolution layers use default stride = 1 and no padding.
+    \item Dropout probability is configurable via \texttt{ModelConfig.dropout}.
+    \item Adaptive average pooling ensures a fixed spatial resolution regardless of input size.
+    \item The model is fully differentiable and backend-agnostic via Burn's \texttt{Backend} trait.
+\end{itemize}
+
+
+\section{Rust Backend: Load Testing with Locust}
+
+The performance of the Rust-based backend was evaluated using the Locust load testing framework. The objective was to analyze system behavior under concurrent user load and measure key performance characteristics such as throughput and latency.
+
+\textbf{Testing Setup:}
+\begin{itemize}
+    \item Tool: Locust
+    \item Backend: Rust (HTTP service)
+    \item Test Type: Concurrent user load simulation
+    \item Environment: Linux system
+\end{itemize}
+
+\textbf{Dashboard Visualization:}
+\textbf{Full Report:}
+
+The complete load testing dashboard has been exported as a PDF and is included below for detailed inspection.
+
+\includepdf[pages=-]{RustLocust/MNIST.pdf}
+
+
+\section{Python Backend: Load Testing with Locust}
+
+The performance of the Python-based backend was evaluated using the Locust load testing framework. 
The goal was to assess system behavior under concurrent user load and analyze key performance characteristics such as throughput and response latency. + +\textbf{Testing Setup:} +\begin{itemize} + \item Tool: Locust + \item Backend: Python (HTTP service) + \item Test Type: Concurrent user load simulation + \item Environment: Linux system +\end{itemize} + +\textbf{Dashboard Visualization:} + +\textbf{Full Report:} + +The complete load testing dashboard has been exported as a PDF and is included below for detailed inspection. + +\includepdf[pages=-]{PythonLocust/MNIST.pdf} + + +\section{Rust: Dockerfile Design and Containerization Strategy} + +The Dockerfile used for the Rust-based MNIST inference application follows a multi-stage build strategy. Multi-stage builds are commonly used to reduce the size of the final container image by separating the compilation environment from the runtime environment. + +\subsection{Overview of Multi-Stage Build} + +The Dockerfile is divided into two major stages: + +\begin{enumerate} + \item Builder Stage + \item Runtime Stage +\end{enumerate} + +The builder stage is responsible for compiling the Rust application, while the runtime stage contains only the compiled binary and the required runtime dependencies. + +\subsection{Builder Stage} + +The first stage begins with: + +\begin{verbatim} +FROM ubuntu:16.04 AS builder +\end{verbatim} + +This instruction uses Ubuntu 16.04 as the base image for building the Rust application. The alias \texttt{builder} is assigned to this stage so that its outputs can later be referenced in the runtime stage. + +\subsubsection{Working Directory} + +\begin{verbatim} +WORKDIR /app/rust_ml +\end{verbatim} + +The \texttt{WORKDIR} instruction sets the default working directory inside the container to: + +\begin{verbatim} +/app/rust_ml +\end{verbatim} + +All subsequent commands in the builder stage are executed relative to this directory. 
+ +\subsubsection{Installing Build Dependencies} + +The following command installs the required packages for compiling the Rust project: + +\begin{verbatim} +RUN apt-get update && apt-get install -y \ + curl \ + build-essential \ + pkg-config \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* +\end{verbatim} + +Each package serves a specific purpose: + +\begin{itemize} + \item \texttt{curl}: Used to download external files, including the Rust installation script. + \item \texttt{build-essential}: Provides common compilation tools such as \texttt{gcc}, \texttt{g++}, and \texttt{make}. + \item \texttt{pkg-config}: Helps discover system libraries during the build process. + \item \texttt{ca-certificates}: Ensures secure HTTPS communication when downloading dependencies. +\end{itemize} + +The final cleanup command: + +\begin{verbatim} +rm -rf /var/lib/apt/lists/* +\end{verbatim} + +removes cached package lists to reduce image size. + +\subsubsection{Installing Rust} + +Rust is installed using the official Rust installer: + +\begin{verbatim} +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y +\end{verbatim} + +This command downloads and executes the \texttt{rustup} installer. + +The flags used have the following meanings: + +\begin{itemize} + \item \texttt{--proto '=https'}: Restricts downloads to HTTPS only. + \item \texttt{--tlsv1.2}: Forces the use of TLS version 1.2 for secure transport. + \item \texttt{-sSf}: Makes \texttt{curl} silent while still showing errors if the download fails. + \item \texttt{-y}: Automatically accepts all installation prompts. +\end{itemize} + +After Rust is installed, the PATH environment variable is updated: + +\begin{verbatim} +ENV PATH="/root/.cargo/bin:${PATH}" +\end{verbatim} + +This ensures that Rust tools such as \texttt{cargo} and \texttt{rustc} are available in subsequent commands. + +\subsubsection{Copying Source Code} + +\begin{verbatim} +COPY . . 
+\end{verbatim} + +This instruction copies the entire project directory from the host system into the current working directory inside the container. + +\subsubsection{Building the Application} + +\begin{verbatim} +RUN cargo build --release -p mnist_infer +\end{verbatim} + +This command compiles the Rust project in release mode. + +The options used are: + +\begin{itemize} + \item \texttt{--release}: Builds the application with compiler optimizations enabled. + \item \texttt{-p mnist\_infer}: Specifies that only the \texttt{mnist\_infer} package should be compiled. +\end{itemize} + +The generated executable is stored in: + +\begin{verbatim} +/app/rust_ml/target/release/mnist_infer +\end{verbatim} + +\subsection{Runtime Stage} + +The second stage begins with: + +\begin{verbatim} +FROM nvidia/vulkan:1.3-470 +\end{verbatim} + +This stage uses an NVIDIA Vulkan runtime image as the base image. The purpose of using this image is to provide Vulkan-related runtime libraries and GPU compatibility for applications that may rely on Vulkan acceleration. + +Compared to the builder image, this runtime image is significantly smaller because it does not contain compilation tools, Rust compilers, or source code. + + +\subsubsection{Runtime Working Directory} + +\begin{verbatim} +WORKDIR /app +\end{verbatim} + +This sets the runtime working directory to: + +\begin{verbatim} +/app +\end{verbatim} + +All runtime files are placed relative to this location. + +\subsubsection{Copying the Compiled Binary} + +\begin{verbatim} +COPY --from=builder /app/rust_ml/target/release/mnist_infer /app/binary +\end{verbatim} + +This instruction copies the compiled executable from the builder stage into the runtime image. + +The \texttt{--from=builder} option tells Docker to retrieve the file from the stage named \texttt{builder}. + +The binary is renamed from: + +\begin{verbatim} +mnist_infer +\end{verbatim} + +to: + +\begin{verbatim} +/app/binary +\end{verbatim} + +inside the runtime container. 
+ +\subsubsection{Copying the Model File} + +\begin{verbatim} +COPY ./model/mnist_rust/model.mpk /app/model/mnist_rust/model.mpk +\end{verbatim} + +This instruction copies the trained model file into the runtime container. + +The model file is stored at: + +\begin{verbatim} +/app/model/mnist_rust/model.mpk +\end{verbatim} + +The application can later load this file during inference. + +\subsubsection{Environment Variables} + +Two environment variables are defined: + +\begin{verbatim} +ENV RUST_LOG=info +ENV MODEL_PATH=/app/model/mnist_rust/model.mpk +\end{verbatim} + +Their purposes are: + +\begin{itemize} + \item \texttt{RUST\_LOG=info}: Enables logging at the info level. + \item \texttt{MODEL\_PATH}: Stores the path to the trained model file. +\end{itemize} + +Using environment variables makes the application more flexible because configuration values can be changed without modifying the source code. + +\subsubsection{Exposing the Application Port} + +\begin{verbatim} +EXPOSE 9050 +\end{verbatim} + +This instruction documents that the containerized application listens on port 9050. + +Although \texttt{EXPOSE} does not automatically publish the port to the host system, it informs users and orchestration tools such as Docker Compose or Kubernetes which port should be mapped. + +\subsubsection{Container Startup Command} + +\begin{verbatim} +CMD ["./binary"] +\end{verbatim} + +This instruction defines the default command executed when the container starts. + +The compiled Rust binary is launched directly from the runtime working directory. + +\subsection{Advantages of the Dockerfile Design} + +This Dockerfile provides several important advantages: + +\begin{itemize} + \item Reduced final image size through multi-stage builds. + \item Separation of build dependencies and runtime dependencies. + \item Improved security because the runtime image does not contain compilers or source code. + \item Faster deployment due to a lightweight runtime container. 
+ \item Better portability because the same container can run consistently across different environments. + \item Easier maintenance through the use of environment variables and explicit working directories. +\end{itemize} + +Overall, this Dockerfile is designed to efficiently package the Rust-based MNIST inference application for deployment while minimizing runtime overhead and maintaining reproducibility. + +\section{Python (PyTorch) Dockerfile} + +This section details the image optimization strategy implemented for the MNIST inference container. The core approach minimizes the Docker image size by decoupling the heavy machine learning dependencies (PyTorch, etc.) from the application container. Instead of baking these libraries into the image, they are stored on an external volume (NFS share) and mounted at runtime. + +\subsection{Dockerfile Analysis} + +The \texttt{Dockerfile} is kept intentionally lightweight. By excluding large dependencies like \texttt{torch} from the \texttt{pip install} command, the image size remains very small (only containing the base Python runtime and lightweight web frameworks). + +\begin{lstlisting}[language=Dockerfile, caption={Optimized Inference Dockerfile}, label={lst:dockerfile_inference}] +FROM python:3.12-slim + +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +ENV OMP_NUM_THREADS=1 +ENV MKL_NUM_THREADS=1 + +# Critical: Point Python to the external volume +ENV PYTHONPATH=/external-libs/ml_env/lib/python3.12/site-packages + +WORKDIR /app + +# Only install lightweight app dependencies +RUN pip install --no-cache-dir --upgrade pip && \ + pip install fastapi==0.110.0 uvicorn==0.29.0 python-multipart==0.0.9 + +COPY app.py model.py model.pt ./ + +EXPOSE 8000 + +CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"] +\end{lstlisting} + +\begin{itemize} + \item \textbf{Base Image:} Uses \texttt{python:3.12-slim} to minimise the OS footprint. 
+ \item \textbf{Environment Configuration:} + \begin{itemize} + \item \texttt{PYTHONDONTWRITEBYTECODE=1}: Prevents Python from writing \texttt{.pyc} files to disk. + \item \textbf{\texttt{PYTHONPATH}}: Crucially set to \texttt{/external-libs/ml\_env/lib/python3.12/site-packages}. This instructs the Python interpreter to look for libraries in the mounted volume directory, not just the default system paths. + \end{itemize} + \item \textbf{Minimal Dependencies:} The \texttt{pip install} command only installs \texttt{fastapi}, \texttt{uvicorn}, and \texttt{python-multipart}. Heavy ML libraries are assumed to be present in the mounted volume. +\end{itemize} + +\subsection{Volume Mounting Strategy} + +The strategy relies on two shell scripts to set up the environment on the host machine and run the container with the correct volume mappings. + +\subsubsection{Library Setup (\texttt{mount\_libs.sh})} +This script runs on the host machine (or a VM node) to prepare the shared library volume. +\begin{enumerate} + \item \textbf{NFS Client Installation:} It installs \texttt{nfs-common} to enable Network File System capabilities. + \item \textbf{Mounting:} It connects to a remote NFS server (\texttt{172.16.203.14}) where the pre-installed ML libraries reside. + \item \textbf{Local Path:} The remote libraries are mounted to \texttt{/mnt/ml-libs} on the host. This directory acts as the bridge between the NFS server and the Docker container. +\end{enumerate} + +\subsubsection{Runtime Execution (\texttt{run\_container.sh})} +This script launches the Docker container with the necessary runtime configurations to access the external libraries. 
+
+\begin{lstlisting}[language=Bash, caption={Container Execution Command}]
+docker run -d \
+    -v /mnt/ml-libs:/external-libs \
+    -e PYTHONPATH=/external-libs/ml_env/lib/python3.12/site-packages \
+    -p 8000:8000 \
+    fastapi-ml-app
+\end{lstlisting}
+
+\begin{itemize}
+    \item \textbf{\texttt{-v /mnt/ml-libs:/external-libs}}: This bind mount maps the host's \texttt{/mnt/ml-libs} (which contains the NFS data) to \texttt{/external-libs} inside the container.
+    \item \textbf{\texttt{-e PYTHONPATH=...}}: explicit environment variable override ensures the container's Python runtime finds the packages in \texttt{/external-libs}.
+\end{itemize}
+
+\subsection{Benefits and Optimization}
+
+\begin{table}[h!]
+\centering
+\caption{Optimization Benefits}
+\label{tab:docker_optimization}
+\begin{tabular}{|l|p{6cm}|p{6cm}|}
+\hline
+\textbf{Feature} & \textbf{Standard Approach} & \textbf{Volume Mount Approach} \\ \hline
+\textbf{Image Size} & \textbf{Huge} ($>2GB$). Includes PyTorch, CUDA binaries, and all dependencies. & \textbf{Tiny} ($\sim$100MB). Only contains app code and minimal HTTP libs. \\ \hline
+\textbf{Build Time} & \textbf{Slow}. Downloading and installing PyTorch takes minutes. & \textbf{Fast}. Setup only installs \texttt{fastapi}. \\ \hline
+\textbf{Updates} & Requires rebuilding and pushing large layers for every code change. & Code changes only require rebuilding the tiny app layer. Library updates are handled externally. \\ \hline
+\end{tabular}
+\end{table}
+
+This architecture allows for rapid deployment and updating of the application logic without the overhead of moving gigabytes of container layers for unchanged machine learning dependencies.
+
+\section{Rust: CNN Model Architecture and Training Performance}
+
+The convolutional neural network (CNN) model used for the experiment consisted of two convolutional layers followed by adaptive average pooling, dropout, and two fully connected layers. 
The complete architecture is shown below: + +\begin{verbatim} +Model { + conv1: Conv2d {ch_in: 1, ch_out: 8, stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Valid, params: 80} + conv2: Conv2d {ch_in: 8, ch_out: 16, stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Valid, params: 1168} + pool: AdaptiveAvgPool2d {output_size: [8, 8]} + dropout: Dropout {prob: 0.5} + linear1: Linear {d_input: 1024, d_output: 512, bias: true, params: 524800} + linear2: Linear {d_input: 512, d_output: 10, bias: true, params: 5130} + activation: Relu + params: 531178 +} +\end{verbatim} + +The model was trained for 10 epochs. Over the course of training, both the training and validation performance improved consistently. Training accuracy increased from 81.575\% in the first epoch to 97.300\% in the final epoch, while validation accuracy improved from 92.133\% to 98.517\%. + +Similarly, the training loss decreased significantly from 0.656 to 0.087, and the validation loss reduced from 0.258 to 0.054 by the end of training. The macro F1-score also improved substantially, reaching 96.974\% for training and 98.321\% for validation. 
+ +\begin{table}[h] +\centering +\caption{Training and Validation Metrics Summary} +\begin{tabular}{|l|l|c|c|c|c|} +\hline +\textbf{Split} & \textbf{Metric} & \textbf{Min.} & \textbf{Epoch} & \textbf{Max.} & \textbf{Epoch} \\ +\hline +Train & Accuracy & 81.575 & 1 & 97.300 & 10 \\ +Train & Loss & 0.087 & 10 & 0.656 & 1 \\ +Train & Precision@Top1 [Macro] & 82.126 & 1 & 97.304 & 10 \\ +Train & Recall@Top1 [Macro] & 81.286 & 1 & 97.232 & 10 \\ +Train & F1-Score@Top1 [Macro] & 79.715 & 1 & 96.974 & 10 \\ +Train & Top-5 Accuracy & 97.696 & 1 & 99.969 & 10 \\ +Train & CPU Memory (GB) & 2.514 & 2 & 2.927 & 10 \\ +Train & CPU Usage (\%) & 20.753 & 5 & 30.394 & 10 \\ +\hline +Valid & Accuracy & 92.133 & 1 & 98.517 & 10 \\ +Valid & Loss & 0.054 & 10 & 0.258 & 1 \\ +Valid & Precision@Top1 [Macro] & 92.154 & 1 & 98.527 & 10 \\ +Valid & Recall@Top1 [Macro] & 91.978 & 1 & 98.425 & 10 \\ +Valid & F1-Score@Top1 [Macro] & 91.176 & 1 & 98.321 & 10 \\ +Valid & Top-5 Accuracy & 99.583 & 1 & 99.967 & 10 \\ +Valid & CPU Memory (GB) & 2.514 & 2 & 3.085 & 10 \\ +Valid & CPU Usage (\%) & 20.539 & 2 & 39.652 & 10 \\ +\hline +\end{tabular} +\label{tab:cnn_metrics_summary} +\end{table} + +The results indicate that the model achieved strong generalization performance with minimal overfitting, as the validation accuracy remained slightly higher than the training accuracy throughout the experiment. The consistently high Top-5 accuracy values further demonstrate that the model was able to correctly identify the correct class within its top predictions. + +It should also be noted that the execution terminated with a segmentation fault after training completion. However, since the fault occurred after all epochs had been completed and metrics had already been recorded, it did not affect the validity of the training results. \\ + +The time taken to train the model is 229.916s (3min 49.916s). 
+ + + +\section{Python: CNN Model Architecture and Training Performance} + +The convolutional neural network (CNN) model implemented in Python mirrors the Rust architecture, consisting of two convolutional layers followed by adaptive average pooling, dropout regularization, and two fully connected layers. + +\textbf{Model Architecture:} + +\begin{verbatim} +Model { + conv1: Conv2d {ch_in: 1, ch_out: 8, kernel_size: [3, 3], stride: [1, 1]} + conv2: Conv2d {ch_in: 8, ch_out: 16, kernel_size: [3, 3], stride: [1, 1]} + pool: AdaptiveAvgPool2d {output_size: [8, 8]} + dropout: Dropout {prob: 0.5} + linear1: Linear {d_input: 1024, d_output: 512, params: 524800} + linear2: Linear {d_input: 512, d_output: 10, params: 5130} + activation: ReLU + total params: 531178 +} +\end{verbatim} + +The model was trained for 10 epochs. Training and validation performance improved consistently over time. + +Training accuracy increased from 82.01\% to 97.29\%, while validation accuracy improved from 92.87\% to 98.20\%. +Training loss decreased significantly from 0.5947 to 0.0867, and validation loss reduced from 0.2475 to 0.0579. + +The macro F1-score reached 0.9814, demonstrating strong classification performance. Additionally, the Top-5 accuracy achieved 99.98\%, indicating highly reliable predictions. 
+ +\begin{table}[h] +\centering +\begin{tabular}{|l|l|c|c|c|c|} +\hline +\textbf{Split} & \textbf{Metric} & \textbf{Min} & \textbf{Epoch} & \textbf{Max} & \textbf{Epoch} \\ +\hline + +Train & Accuracy & 82.01 & 1 & 97.29 & 10 \\ +Train & Loss & 0.0867 & 10 & 0.5947 & 1 \\ +Train & Precision@Top1 [Macro] & -- & -- & -- & -- \\ +Train & Recall@Top1 [Macro] & -- & -- & -- & -- \\ +Train & F1-Score@Top1 [Macro] & -- & -- & -- & -- \\ +Train & Top-5 Accuracy & -- & -- & -- & -- \\ +Train & CPU Memory (GB) & 0.84 & -- & 0.98 & -- \\ +Train & CPU Usage (\%) & 72.1 & -- & 92.0 & -- \\ +\hline + +Valid & Accuracy & 92.87 & 1 & 98.20 & 10 \\ +Valid & Loss & 0.0579 & 10 & 0.2475 & 1 \\ +Valid & Precision@Top1 [Macro] & 0.9816 & -- & 0.9816 & -- \\ +Valid & Recall@Top1 [Macro] & 0.9812 & -- & 0.9812 & -- \\ +Valid & F1-Score@Top1 [Macro] & 0.9814 & -- & 0.9814 & -- \\ +Valid & Top-5 Accuracy & 99.98 & -- & 99.98 & -- \\ +Valid & CPU Memory (GB) & 0.84 & -- & 0.98 & -- \\ +Valid & CPU Usage (\%) & 72.1 & -- & 92.0 & -- \\ +\hline + +\end{tabular} +\caption{Python Training and Validation Metrics Summary} +\end{table} + +The results indicate strong generalization performance with no signs of overfitting. Validation accuracy remained consistently high and closely followed training accuracy. + +Training was stable with zero NaN events observed. The total training time was 182.23 seconds, with an average epoch time of 15.05 seconds and an average iteration speed of 50.58 iterations per second. 
+ + + +\newpage + +\section{Container Comparison: Python vs Rust} + +\begin{table}[h] +\centering +\begin{tabular}{|l|c|c|} +\hline +\textbf{Feature} & \textbf{Python (GPU)} & \textbf{Rust (WGPU)} \\ +\hline +Image Name & text\_classification\_image & text\_class\_rs \\ +\hline +Size & 3.98GB & 1.02GB \\ +\hline +Backend & PyTorch + CUDA & Native Rust (WGPU) \\ +\hline +Startup Time & Slower & Faster \\ +\hline +Dependencies & Heavy (Torch, CUDA, Python) & Minimal (compiled binary) \\ +\hline +Deployment Complexity & Higher & Lower \\ +\hline +Flexibility & High (research-friendly) & Moderate \\ +\hline +Runtime Stability & Medium & High \\ +\hline +\end{tabular} +\caption{Comparison of Python GPU-based and Rust-based inference containers} +\end{table} + +\noindent +The Python-based container provides flexibility and rapid experimentation using the PyTorch ecosystem, but at the cost of larger image size and dependency complexity. In contrast, the Rust-based container offers a lightweight, production-ready solution with faster startup time and minimal runtime dependencies, making it more suitable for deployment scenarios. + +\section{Model Size Comparison} + +\begin{table}[h] +\centering +\begin{tabular}{|l|c|c|} +\hline +\textbf{Model} & \textbf{Rust (.mpk/.bin)} & \textbf{Python (.pt/.pth)} \\ +\hline +MNIST & 1.1 MB & 2 MB \\ +\hline +Text Classification (AG News) & 21 MB & 40.6 MB \\ +\hline +Regression & 4 KB & 6 MB \\ +\hline +LSTM & 60 KB & 120 KB \\ +\hline +\end{tabular} +\caption{Comparison of model sizes between Rust and Python implementations} +\end{table} + +\noindent +Rust-based serialized models are consistently smaller than their Python counterparts. This reduction is most significant in simpler models such as regression, and remains substantial for larger models like text classification. The smaller footprint of Rust models makes them more suitable for lightweight deployment and resource-constrained environments. 
+ + +\end{document} diff --git a/latex_reports/regression.tex b/latex_reports/regression.tex new file mode 100644 index 0000000..2b18f1b --- /dev/null +++ b/latex_reports/regression.tex @@ -0,0 +1,333 @@ +\documentclass[12pt]{article} +\usepackage{amsmath} +\usepackage{amssymb} +\usepackage{graphicx} +\usepackage{hyperref} +\usepackage{geometry} +\geometry{a4paper, margin=1in} +\usepackage{pdfpages} + + +\title{\textbf{Comparative Analysis of Regression Implementations: Rust (Burn) vs. PyTorch}} +\author{Technical Report} +\date{\today} + +\begin{document} + +\maketitle +\tableofcontents +\newpage + +\section{Introduction} +This document outlines the architectural, mathematical, and deployment specifics of implementing a Neural Network-based Regression model across two disparate machine learning environments: Rust (utilizing the Burn framework) and Python (utilizing PyTorch). It covers the distinct model architecture decisions, dataset handling strategies, and specialized pipeline deployment techniques leveraging Network File Systems (NFS) mapping via Docker bounds. + +\section{Model Architecture and Mathematical Formulation} + +\subsection{Mathematical Foundation} +The core mathematical foundation deployed across both frameworks is a classical Feed-Forward Neural Network consisting of a single hidden dimension mapping inputs directly onto a continuous single-variable regression output. + +For a given input feature vector $X \in \mathbb{R}^N$ (where $N$ dictates the feature size depending on the target dataset), the network's forward transformation can be represented sequentially as: +\begin{align} + Z_1 &= X \cdot W_1^T + b_1 \quad &\text{(Input Projection)} \\ + A_1 &= \max(0, Z_1) \quad &\text{(ReLU Activation)} \\ + \hat{Y} &= A_1 \cdot W_2^T + b_2 \quad &\text{(Output Projection)} +\end{align} + +Where: +\begin{itemize} + \item $W_1 \in \mathbb{R}^{H \times N}$ and $b_1 \in \mathbb{R}^H$ map the inputs onto the hidden vector space $H$. 
+ \item $\max(0, \cdot)$ denotes the Non-Linear Rectified Linear Unit (ReLU) mapping algorithm. + \item $W_2 \in \mathbb{R}^{1 \times H}$ and $b_2 \in \mathbb{R}$ collapse the hidden abstraction onto the finalized regression scalar prediction $\hat{Y}$. +\end{itemize} + +\subsection{Architectural Configurations} +While the mathematical foundations are identical, implementations slightly differ based on dataset selections within the modules: +\begin{itemize} + \item \textbf{PyTorch Architecture:} Configures $N=13$ input features mapping to $H=64$ hidden parameters. + \item \textbf{Rust (Burn) Architecture:} Configures $N=8$ input features concurrently mapping to $H=64$ hidden parameters. +\end{itemize} +In both configurations, standard parameter biases (`bias=True`) are included and automatically initialized. + +\section{Training Pipelines} + +Both codebases train the model iteratively tracking gradients via the Adam optimizer scaled against Mean Squared Error (MSE) loss logic: +\[ \text{MSE} = \frac{1}{B} \sum_{i=1}^{B} (Y_i - \hat{Y}_i)^2 \] + +\subsection{PyTorch Context} +\begin{itemize} + \item \textbf{Data Loading:} Automatically pulls the \textbf{Boston Housing} dataset array (.npz file) from an external Google API via `urllib` and manually partitions it down into an explicit $80/20$ split. + \item \textbf{Telemetry Metrics:} Generates explicit hardware tracking loops inside the main epoch runner. Uses the `psutil` library to compute and stream epoch `iteration\_speed`, raw RAM consumption, and `cpu\_temp` hardware sensors parallel to the loss parameters. +\end{itemize} + +\subsection{Rust (Burn) Context} +\begin{itemize} + \item \textbf{Data Loading:} Links into Huggingface's dataset registry asynchronously targeting the \textbf{California Housing} SQLized splits mapping onto memory arrays via localized `HousingDistrictItem` structs. 
+ \item \textbf{Normalization Mapping:} Computes spatial min-max normalizations programmatically over inputs during training: + \[ X_{norm} = \frac{X - \text{min}}{\text{max} - \text{min}} \] + This logic restricts features within standard boundaries precluding exploding gradient derivations. +\end{itemize} + +\section{Inference Pipeline and Docker NFS Integration} + +Deploying these isolated pipelines necessitates radically different execution strategies, highlighting Python's heavyweight runtime dependency bottlenecks versus Rust's compile-time optimizations. + +\subsection{PyTorch Inference Architecture} +Standard PyTorch Docker environments routinely eclipse several gigabytes due to CUDA bindings and generic scientific computation loops. To circumvent this inside microservices, the PyTorch inference pipeline mandates a hybrid Network File System (NFS) mapping architecture: +\begin{enumerate} + \item \textbf{NFS Mounting (\texttt{mount\_libs.sh}):} Installs an external `nfs-common` client locally and binds the extensive python library volume from an external dedicated storage server (`172.16.203.14`) into the host machine's `/mnt/LSTM-libs` map. + \item \textbf{Lightweight Container Image:} The backend \texttt{Dockerfile} avoids `pip install` commands completely, simply initializing a barebone `nvidia/cuda:12.1.1` image mapping Python $3.11$ system links. + \item \textbf{Volume Inject (\texttt{run\_container.sh}):} The script initializes the container enforcing `-v` flags that sync the NFS `/mnt/LSTM-libs` directory seamlessly onto the Docker's `/external-libs`. Crucially, it overrides the system \texttt{PYTHONPATH} to target those external `site-packages` at runtime. + \item \textbf{Execution:} The `FastAPI` instance loads, bypasses massive disk pulls, links the models iteratively, and fields inbound `HousingFeatures` lists continuously. 
+\end{enumerate} + +\subsection{Rust (Burn) Inference Architecture} +Rust handles Docker microservices inherently via statically linked deployments: +\begin{itemize} + \item \textbf{Multi-Stage Compiling:} Executes a build phase operating within an oversized `rust:1.92-alpine` chain, ejecting the resulting binary onto an isolated stripped `alpine:3.23` environment structure. + \item \textbf{Native Routing:} Utilizes \texttt{Axum} servers to establish the HTTP logic endpoints securely routing JSON payloads mapping to specific feature names (e.g. \texttt{median\_income}, \texttt{house\_age}). +\end{itemize} + +\section{Rust: Regression Model Performance Analysis} + +The regression model used in this experiment was a simple feed-forward neural network consisting of one hidden layer followed by an output layer. The model was designed to predict the median house value based on eight input features. + +\subsection{Model Architecture} + +The architecture of the regression model is shown below: + +\begin{verbatim} +RegressionModel { + input_layer: Linear {d_input: 8, d_output: 64, bias: true, params: 576} + output_layer: Linear {d_input: 64, d_output: 1, bias: true, params: 65} + activation: Relu + params: 641 +} +\end{verbatim} + +The model contains: + +\begin{itemize} + \item An input layer that maps 8 input features to 64 hidden units + \item A ReLU activation function applied after the hidden layer + \item An output layer that maps the 64 hidden units to a single scalar value +\end{itemize} + +The total number of trainable parameters in the model was only 641, making it a lightweight model suitable for fast training and inference. + +\subsection{Training Configuration} + +The model was trained for 100 epochs. A constant learning rate of: + +\[ +1.0 \times 10^{-3} +\] + +was used throughout the entire training process. + +\subsection{Training Performance} + +The training loss decreased substantially over the 100 epochs. 
Initially, the model started with a training loss of 3.086 during the first epoch. By the final epoch, the loss had reduced to 0.414. + +This significant reduction in loss indicates that the model successfully learned the underlying relationship between the input features and the target variable. + +\subsection{Validation Performance} + +Validation loss also showed a considerable improvement during training. The validation loss decreased from 4.132 in the first epoch to a minimum of 0.635 at epoch 51. + +The difference between the final training loss and the minimum validation loss suggests that the model achieved good generalization performance without severe overfitting. + +\begin{table}[h] +\centering +\caption{Training and Validation Metrics Summary for the Regression Model} +\begin{tabular}{|l|l|c|c|c|c|} +\hline +\textbf{Split} & \textbf{Metric} & \textbf{Min.} & \textbf{Epoch} & \textbf{Max.} & \textbf{Epoch} \\ +\hline +Train & Loss & 0.414 & 100 & 3.086 & 1 \\ +Train & Learning Rate & $1.0 \times 10^{-3}$ & 1 & $1.0 \times 10^{-3}$ & 100 \\ +Train & CPU Memory (GB) & 2.125 & 4 & 2.325 & 56 \\ +Train & CPU Usage (\%) & 19.539 & 54 & 37.989 & 11 \\ +\hline +Valid & Loss & 0.635 & 51 & 4.132 & 1 \\ +Valid & CPU Memory (GB) & 2.124 & 3 & 2.325 & 55 \\ +Valid & CPU Usage (\%) & 19.550 & 54 & 37.960 & 11 \\ +\hline +\end{tabular} +\label{tab:regression_metrics_summary} +\end{table} + +\subsection{Prediction Example} + +A sample prediction generated by the model is shown below: + +\begin{verbatim} +Predicted 2.021734 Expected 2.158 +\end{verbatim} + +The predicted value is reasonably close to the expected value, indicating that the model was able to approximate the target variable with acceptable accuracy. 
+
+Since the median house value was measured in units of 100,000 dollars, the prediction corresponds to:
+
+\begin{itemize}
+    \item Predicted value: approximately 202,173 dollars
+    \item Expected value: approximately 215,800 dollars
+\end{itemize}
+
+\subsection{Predicted vs. Expected Distribution}
+
+The predicted-versus-expected plot suggests that the model captures the general trend in the target values, although some prediction errors remain for certain samples.
+
+Most of the predicted values appear concentrated around the central region of the distribution, indicating that the model performs better on common house value ranges than on extreme values.
+
+\subsection{Resource Utilization}
+
+The model required relatively little memory during execution. Training memory usage ranged from 2.125 GB to 2.325 GB, while validation memory usage ranged from 2.124 GB to 2.325 GB.
+
+CPU utilization remained moderate throughout training. Training CPU usage ranged from 19.539\% to 37.989\%, while validation CPU usage ranged from 19.550\% to 37.960\%.
+
+CPU temperature values were unavailable and therefore recorded as NaN.
+
+\subsection{Execution Time}
+
+The complete training and evaluation process required:
+
+\begin{itemize}
+    \item Real time: 3 minutes and 18.257 seconds
+    \item User CPU time: 4 minutes and 13.554 seconds
+    \item System CPU time: 50.340 seconds
+\end{itemize}
+
+\section{Language-Specific Implementation Details}
+
+\subsection{PyTorch-Specific Paradigms}
+\begin{itemize}
+    \item \textbf{Thread Clamping:} Due to inference optimization restrictions (especially running CPU variations alongside container structures), the `app.py` enforces explicit core binding calls via `torch.set\_num\_threads(1)` and `torch.set\_num\_interop\_threads(1)` securing computational resources and restricting OS context-switching overheads. 
+    \item \textbf{Matrix Array Verifications:} Manually inspects raw matrix vector mappings validating dimensions dynamically against numeric constraints: \texttt{len(x) != NUM\_FEATURES} triggering runtime panics before pipeline evaluations fail.
+    \item \textbf{Manual Hardware Moving:} The framework is heavily littered with required `.to(device)` mapping configurations switching inputs, datasets, targets, and models manually between the host and external components.
+\end{itemize}
+
+\subsection{Rust (Burn)-Specific Paradigms}
+\begin{itemize}
+    \item \textbf{Generic Compile-Time Shapes:} Dimension mappings and tensor validations are fundamentally enforced inside the Rust compiler boundaries via generically ranked `Tensor` arrays indicating batches of distinct input structures mapping to `targets: Tensor`. Invalid sizes fail compilation, voiding the requirement for manual PyTorch matrix validations.
+    \item \textbf{Struct Batching Protocols:} Inference doesn't evaluate primitive float arrays. Instead, the API relies on executing an overarching `HousingBatcher` which transforms specific struct domains (\texttt{HousingDistrictItem}) safely into tensor primitives while executing implicit `self.normalizer.to\_device(device)` logic silently against constants behind boundaries.
+    \item \textbf{Record Deserialization:} States are strictly detached from models via standard `.mpk` maps. They invoke explicit \texttt{NoStdTrainingRecorder::new().load()} tracking traits unbinding memory limits inherent to standard dict serialization configurations natively loaded via `RegressionModelConfig`.
+\end{itemize}
+
+\newpage
+\section{Python: Regression Model Architecture and Training Performance}
+
+The regression model used in this experiment is a lightweight fully connected neural network with a small number of parameters (961 total). The model is optimized using mean squared error loss.
+
+
+
+The model was trained for 100 epochs. 
Training loss decreased significantly from 8265.55 to 69.86 (99.15\% reduction), while validation loss decreased from 9045.34 to 55.70. + +Despite strong loss reduction, the model struggled to achieve good generalization. The validation $R^2$ score remained negative (-1.07), indicating that the model performs worse than a simple baseline predictor. + +\begin{table}[h] +\centering +\begin{tabular}{|l|l|c|c|c|c|} +\hline +\textbf{Split} & \textbf{Metric} & \textbf{Min} & \textbf{Epoch} & \textbf{Max} & \textbf{Epoch} \\ +\hline + +Train & Loss & 68.50 & 100 & 8265.55 & 1 \\ +Train & RMSE & 8.39 & 100 & 91.53 & 1 \\ +Train & MAE & 6.29 & 100 & 90.33 & 1 \\ +Train & R$^2$ & -96.93 & 1 & 0.1767 & 100 \\ +Train & Grad Norm (Total) & 118.80 & -- & 112876.03 & 1 \\ +Train & Iteration Speed (it/s) & 0.69 & 1 & 14.39 & 54 \\ +Train & CPU Memory (GB) & 0.87 & -- & 1.02 & -- \\ +Train & CPU Usage (\%) & 45.7 & -- & 97.6 & -- \\ +\hline + +Valid & Loss & 50.93 & 65 & 9045.34 & 1 \\ +Valid & RMSE & 7.14 & 65 & 95.11 & 1 \\ +Valid & MAE & 6.00 & 65 & 93.52 & 1 \\ +Valid & R$^2$ & -335.40 & 1 & -0.8940 & 65 \\ +Valid & Iteration Speed (it/s) & 0.69 & 1 & 14.39 & 54 \\ +Valid & CPU Memory (GB) & 0.87 & -- & 1.02 & -- \\ +Valid & CPU Usage (\%) & 45.7 & -- & 97.6 & -- \\ +\hline + +\end{tabular} +\caption{Regression Model Training and Validation Metrics Summary} +\end{table} + +\textbf{Training Efficiency and Stability:} +\begin{itemize} + \item Total Training Time: 47.39 seconds + \item Average Epoch Time: 0.225 seconds + \item Iteration Speed (Mean): 10.36 it/s + \item Gradient Norm (Mean): 7981.31 + \item NaN Events: 0 + \item Convergence: Non-monotonic loss decrease + \item Overfitting Detected: No +\end{itemize} + +The results indicate that while optimization was successful in reducing loss, the model lacks sufficient capacity or feature representation to generalize well. 
The persistently negative validation $R^2$ suggests underfitting or a mismatch between model complexity and data characteristics. + + + + +\newpage + +\section{Container Comparison: Python vs Rust} + +\begin{table}[h] +\centering +\begin{tabular}{|l|c|c|} +\hline +\textbf{Feature} & \textbf{Python (GPU)} & \textbf{Rust (WGPU)} \\ +\hline +Image Name & text\_classification\_image & text\_class\_rs \\ +\hline +Size & 3.98GB & 973MB \\ +\hline +Backend & PyTorch + CUDA & Native Rust (WGPU) \\ +\hline +Startup Time & Slower & Faster \\ +\hline +Dependencies & Heavy (Torch, CUDA, Python) & Minimal (compiled binary) \\ +\hline +Deployment Complexity & Higher & Lower \\ +\hline +Flexibility & High (research-friendly) & Moderate \\ +\hline +Runtime Stability & Medium & High \\ +\hline +\end{tabular} +\caption{Comparison of Python GPU-based and Rust-based inference containers} +\end{table} + +\noindent +The Python-based container provides flexibility and rapid experimentation using the PyTorch ecosystem, but at the cost of larger image size and dependency complexity. In contrast, the Rust-based container offers a lightweight, production-ready solution with faster startup time and minimal runtime dependencies, making it more suitable for deployment scenarios. + + +\section{Model Size Comparison} + +\begin{table}[h] +\centering +\begin{tabular}{|l|c|c|} +\hline +\textbf{Model} & \textbf{Rust (.mpk/.bin)} & \textbf{Python (.pt/.pth)} \\ +\hline +MNIST & 1.1 MB & 2 MB \\ +\hline +Text Classification (AG News) & 21 MB & 40.6 MB \\ +\hline +Regression & 4 KB & 6 MB \\ +\hline +LSTM & 60 KB & 120 KB \\ +\hline +\end{tabular} +\caption{Comparison of model sizes between Rust and Python implementations} +\end{table} + +\noindent +Rust-based serialized models are consistently smaller than their Python counterparts. This reduction is most significant in simpler models such as regression, and remains substantial for larger models like text classification. 
The smaller footprint of Rust models makes them more suitable for lightweight deployment and resource-constrained environments. + + + + +\end{document} diff --git a/latex_reports/text_classification_news.tex b/latex_reports/text_classification_news.tex new file mode 100644 index 0000000..dafcd2c --- /dev/null +++ b/latex_reports/text_classification_news.tex @@ -0,0 +1,652 @@ +\documentclass[11pt,a4paper]{article} + +\usepackage{geometry} +\geometry{margin=1in} + +\usepackage{amsmath,amssymb} +\usepackage{graphicx} +\usepackage{booktabs} +\usepackage{hyperref} +\usepackage{enumitem} +\usepackage{array} +\usepackage{float} +\usepackage{listings} +\usepackage{pdfpages} + + +\lstdefinelanguage{Dockerfile}{ + keywords={FROM, RUN, CMD, LABEL, MAINTAINER, EXPOSE, ENV, ADD, COPY, ENTRYPOINT, VOLUME, USER, WORKDIR, ARG, ONBUILD, STOPSIGNAL, HEALTHCHECK, SHELL}, + sensitive=true, + comment=[l]{\#}, + morestring=[b]", +} + +\title{\textbf{Text Classification}} +\date{\today} + +\begin{document} + +\maketitle + +\section{Model Architecture and Training Strategy} + +The text classification system is built using the \texttt{Burn} framework in Rust, leveraging a Transformer-based architecture for feature extraction and a linear classification head. This section details the mathematical formulation of the model and the strategy employed for training. + +\subsection{Model Architecture} +The core of the model is a Transformer Encoder, which processes a sequence of token embeddings to capture contextual relationships. The architecture consists of three primary stages: embedding, encoding, and classification. + +\subsubsection{Embedding Layer} +Input text is tokenized and converted into a sequence of indices $X \in \mathbb{N}^{B \times L}$, where $B$ is the batch size and $L$ is the sequence length. The model utilizes two parallel embedding layers: +\begin{enumerate} + \item \textbf{Token Embedding ($E_{tok}$)}: Maps token indices to dense vectors of dimension $d_{model}$. 
+ \item \textbf{Positional Embedding ($E_{pos}$)}: Maps position indices $[0, \dots, L-1]$ to dense vectors of dimension $d_{model}$ to inject sequence order information. +\end{enumerate} + +The final embedding representation $E$ is obtained by averaging the token and positional embeddings: +\begin{equation} + E = \frac{E_{tok}(X) + E_{pos}(\text{positions})}{2} +\end{equation} + +\subsubsection{Transformer Encoder} +The embedding tensor $E$ is passed through a multi-layer Transformer Encoder. Each layer consists of a Multi-Head Self-Attention (MHSA) mechanism followed by a Position-wise Feed-Forward Network (FFN), with residual connections and layer normalization. + +The configuration used in this implementation is as follows: +\begin{itemize} + \item \textbf{Model Dimension ($d_{model}$)}: 256 + \item \textbf{Feed-Forward Dimension ($d_{ff}$)}: 1024 + \item \textbf{Number of Heads ($N_{heads}$)}: 8 + \item \textbf{Number of Layers ($N_{layers}$)}: 4 + \item \textbf{Normalization}: Layer norm applied before sub-layers (Pre-Norm). +\end{itemize} + +Let $H = \text{TransformerEncoder}(E)$, where $H \in \mathbb{R}^{B \times L \times d_{model}}$ represents the contextualized representations of the input sequence. + +\subsubsection{Classification Head} +For classification, the model utilizes the representation of the first token (typically acting as the [CLS] token) from the encoded sequence. This vector is passed through a linear layer to project it into the class space: +\begin{equation} + Y = \text{Linear}(H_{[:, 0, :]}) +\end{equation} +where $Y \in \mathbb{R}^{B \times N_{classes}}$ represents the logits. 
For inference, a Softmax function is applied to obtain probabilities:
+\begin{equation}
+    \hat{P} = \text{Softmax}(Y)
+\end{equation}
+
+\begin{table}[h]
+\centering
+\caption{Model Architecture Summary}
+\label{tab:model_arch}
+\begin{tabular}{|l|l|c|c|}
+\hline
+\textbf{Component} & \textbf{Configuration / Details} & \textbf{Input Shape} & \textbf{Output Shape} \\ \hline
+Token Embedding & $V \to d_{model}$ ($V$: Vocab Size) & $(B, L)$ & $(B, L, 256)$ \\ \hline
+Pos Embedding & $L_{max} \to d_{model}$ & $(B, L)$ & $(B, L, 256)$ \\ \hline
+Embedding Merge & Average ($E_{tok} + E_{pos}$) & - & $(B, L, 256)$ \\ \hline
+Transformer Block & 4 Layers, 8 Heads, $d_{ff}=1024$ & $(B, L, 256)$ & $(B, L, 256)$ \\ \hline
+Feature Extract & Slice First Token (Index 0) & $(B, L, 256)$ & $(B, 256)$ \\ \hline
+Classifier Head & Linear ($256 \to N_{classes}$) & $(B, 256)$ & $(B, N_{classes})$ \\ \hline
+\end{tabular}
+\end{table}
+
+
+
+\section{Rust Backend: Load Testing with Locust}
+
+The performance of the Rust-based backend was evaluated using the Locust load testing framework. The objective was to analyze system behavior under concurrent user load and measure key performance characteristics such as throughput and latency.
+
+\textbf{Testing Setup:}
+\begin{itemize}
+    \item Tool: Locust
+    \item Backend: Rust (HTTP service)
+    \item Test Type: Concurrent user load simulation
+    \item Environment: Linux system
+\end{itemize}
+
+\textbf{Dashboard Visualization:}
+\textbf{Full Report:}
+
+The complete load testing dashboard has been exported as a PDF and is included below for detailed inspection.
+
+\includepdf[pages=-]{RustLocust/text.pdf}
+
+
+\section{Python Backend: Load Testing with Locust}
+
+The performance of the Python-based backend was evaluated using the Locust load testing framework. The goal was to assess system behavior under concurrent user load and analyze key performance characteristics such as throughput and response latency. 
+
+\textbf{Testing Setup:}
+\begin{itemize}
+    \item Tool: Locust
+    \item Backend: Python (HTTP service)
+    \item Test Type: Concurrent user load simulation
+    \item Environment: Linux system
+\end{itemize}
+
+\textbf{Dashboard Visualization:}
+
+\textbf{Full Report:}
+
+The complete load testing dashboard has been exported as a PDF and is included below for detailed inspection.
+
+\includepdf[pages=-]{PythonLocust/text.pdf}
+
+
+\subsection{Training Strategy}
+The model is trained using a supervised learning approach with the following configuration:
+
+\subsubsection{Loss Function}
+The training objective is to minimize the Cross-Entropy Loss between the predicted logits $Y$ and the ground truth class labels $C$:
+\begin{equation}
+    \mathcal{L} = \text{CrossEntropy}(Y, C) = -\sum_{c=1}^{N_{classes}} \mathbb{1}_{c=C} \log\left(\frac{e^{Y_c}}{\sum_{j} e^{Y_j}}\right)
+\end{equation}
+
+\subsubsection{Optimization}
+We employ the \textbf{Adam} optimizer with the following parameters:
+\begin{itemize}
+    \item \textbf{Weight Decay}: $5 \times 10^{-5}$
+    \item \textbf{Beta Coefficients}: Standard defaults (typically $\beta_1=0.9, \beta_2=0.999$)
+\end{itemize}
+
+\subsubsection{Learning Rate Scheduling}
+A \textbf{Noam Learning Rate Scheduler} is used to stabilize training. The learning rate increases linearly during a warmup phase and then decays proportionally to the inverse square root of the step number.
+\begin{equation}
+    LR = d_{model}^{-0.5} \cdot \min(step\_num^{-0.5}, step\_num \cdot warmup\_steps^{-1.5})
+\end{equation}
+\begin{itemize}
+    \item \textbf{Warmup Steps}: 1000
+    \item \textbf{Base Learning Rate}: 0.01
+\end{itemize}
+
+\subsubsection{Metrics}
+During training and validation, the following metrics are tracked to monitor performance:
+\begin{itemize}
+    \item \textbf{Loss}: Cross-Entropy Loss.
+    \item \textbf{Accuracy}: Percentage of correct predictions.
+    \item \textbf{F1-Score, Precision, Recall}: Macro-averaged metrics to account for class balance. 
+\end{itemize}
+
+\section{Burn Code Specifications}
+
+This section outlines the significant implementation details of the text classification system, focusing on the architectural choices in \texttt{model.rs} and the robust training pipeline defined in \texttt{training.rs}.
+\subsection{Model Implementation (\texttt{model.rs})}
+The \texttt{TextClassificationModel} leverages the \textbf{Burn} framework's modular design to implement a Transformer-based classifier. Key features of this implementation include:
+\begin{itemize}
+    \item \textbf{Dual Embedding Strategy:} The model employs two distinct embedding layers: \texttt{embedding\_token} for semantic content and \texttt{embedding\_pos} for positional information. A unique characteristic of this implementation is the fusion strategy, where these embeddings are combined via averaging:
+    \[
+    E_{final} = \frac{E_{pos} + E_{token}}{2}
+    \]
+    This differs from the standard summation approach often found in BERT implementations, potentially stabilizing the initial magnitude of the embedding vectors.
+
+    \item \textbf{Configurable Architecture:} The system uses a \texttt{TextClassificationModelConfig} struct derived with the \texttt{Config} macro. This allows for type-safe and serializable hyperparameter management, ensuring the model architecture (hidden size, vocabulary size, sequence length) can be easily saved, loaded, and reproduced.
+
+    \item \textbf{Masked Attention:} The forward pass actively utilizes padding masks (\texttt{mask\_pad}). These masks are passed into the \texttt{TransformerEncoderInput}, ensuring that the self-attention mechanism strictly ignores padding tokens, which is critical for handling variable-length text sequences correctly.
+
+    \item \textbf{Separation of Train and Inference Logic:} The model explicitly implements the \texttt{TrainStep} and \texttt{InferenceStep} traits. 
+    \begin{itemize}
+        \item \textbf{Training:} Returns a \texttt{ClassificationOutput} struct containing the calculated Cross-Entropy loss for backpropagation.
+        \item \textbf{Inference:} Returns raw probabilities by applying a softmax activation on the output logits, facilitating direct class prediction.
+    \end{itemize}
+\end{itemize}
+\subsection{Training Pipeline (\texttt{training.rs})}
+The training module is designed for reliability and comprehensive observability. It integrates advanced optimization techniques and hardware-aware monitoring.
+\begin{itemize}
+    \item \textbf{Noam Scheduler:} Transformer models are notoriously sensitive to learning rates. The code implements the \textbf{Noam Learning Rate Scheduler} (popularized by ``Attention Is All You Need''), which features a linear warmup phase (1000 steps) followed by an inverse square root decay based on the model dimension ($d_{model}$). This prevents gradient explosions during early training stages.
+
+    \item \textbf{Distributed Training Support:} The implementation explicitly handles distributed computing scenarios. It utilizes Rust's feature flags (\texttt{\#[cfg(feature = "ddp")]}) to switch between single-device training and \textbf{Distributed Data Parallel (DDP)} strategies. When enabled, it employs a tree-based \texttt{AllReduceStrategy} for synchronizing gradients across multiple GPUs or nodes.
+
+    \item \textbf{Comprehensive Telemetry:} The training loop is instrumented with an extensive suite of metrics beyond simple accuracy. It tracks:
+    \begin{itemize}
+        \item \textbf{Classification Metrics:} Macro-averaged F1-Score, Precision, and Recall, providing a holistic view of model performance on imbalanced datasets.
+        \item \textbf{Hardware Diagnostics:} CPU temperature, memory usage, and utilization are logged alongside training progress, aiding in the detection of thermal throttling or memory leaks during long training runs. 
+ \end{itemize} + + \item \textbf{Efficient Data Sampling:} To manage large datasets efficiently, the loader utilizes a \texttt{SamplerDataset}. This limits the effective epoch size to 50,000 training samples and 5,000 validation samples, allowing for rapid iteration and feedback loops without needing to process the entire corpus in every epoch. +\end{itemize} + +\subsection{Conditional Compilation} +I think we should document a bit about this. + +\section{Rust Docker image} + +\section{Rust Inference Code} + +\section{Rust: Transformer-Based Text Classification Model Performance} + +The text classification model used in this experiment was based on a Transformer encoder architecture. The model consisted of token embeddings, positional embeddings, a multi-layer Transformer encoder, and a final linear classification layer. + +\subsection{Model Architecture} + +The architecture of the model is shown below: + +\begin{verbatim} +TextClassificationModel { + transformer: TransformerEncoder { + d_model: 256, + d_ff: 1024, + n_heads: 8, + n_layers: 4, + dropout: 0.1, + norm_first: true, + quiet_softmax: true, + params: 3159040 + } + embedding_token: Embedding { + n_embedding: 28996, + d_model: 256, + params: 7422976 + } + embedding_pos: Embedding { + n_embedding: 256, + d_model: 256, + params: 65536 + } + output: Linear { + d_input: 256, + d_output: 4, + bias: true, + params: 1028 + } + n_classes: 4 + params: 10648580 +} +\end{verbatim} + +The Transformer encoder used four encoder layers with eight attention heads per layer. Each layer had a model dimension of 256 and a feed-forward dimension of 1024. A dropout rate of 0.1 was used to reduce overfitting. + +The token embedding layer mapped a vocabulary of 28,996 tokens into 256-dimensional vectors. Positional embeddings of length 256 were also used so that the Transformer could capture token order information. + +The final output layer mapped the Transformer representation into four output classes. 
+ +The total number of trainable parameters in the model was 10,648,580. + +\subsection{Training Configuration} + +The model was trained for a total of 5 epochs. During training, the learning rate decayed from $1.107 \times 10^{-5}$ in the first epoch to $3.733 \times 10^{-6}$ in the final epoch. + +\subsection{Training Performance} + +Training accuracy improved steadily from 57.968\% in the first epoch to 81.474\% in the fifth epoch. Similarly, the training loss decreased from 0.981 to 0.507. + +The macro precision, recall, and F1-score also improved significantly during training: + +\begin{itemize} + \item Precision increased from 58.893\% to 81.606\% + \item Recall increased from 57.741\% to 81.334\% + \item F1-score increased from 50.201\% to 76.001\% +\end{itemize} + +These results indicate that the model learned meaningful semantic patterns in the text data over successive epochs. + +\subsection{Validation Performance} + +Validation performance also improved consistently. Validation accuracy increased from 72.280\% in the first epoch to a maximum of 81.640\% in the fourth epoch. + +The validation loss decreased from 0.731 to 0.507, showing that the model generalized reasonably well to unseen data. + +The validation precision, recall, and F1-score also showed strong improvement: + +\begin{itemize} + \item Precision increased from 72.230\% to 81.739\% + \item Recall increased from 72.258\% to 82.039\% + \item F1-score increased from 65.796\% to 76.509\% +\end{itemize} + +The relatively close values between training and validation accuracy suggest that the model did not suffer from severe overfitting. + +\begin{table}[h] +\centering +\caption{Training and Validation Metrics Summary for the Transformer-Based Text Classification Model} +% @kumar please layout sahi kar sakta hai? 
love you +\begin{tabular}{|l|l|c|c|c|c|} +\hline +\textbf{Split} & \textbf{Metric} & \textbf{Min.} & \textbf{Epoch} & \textbf{Max.} & \textbf{Epoch} \\ +\hline +Train & Accuracy & 57.968 & 1 & 81.474 & 5 \\ +Train & Loss & 0.507 & 5 & 0.981 & 1 \\ +Train & Precision@Top1 [Macro] & 58.893 & 1 & 81.606 & 5 \\ +Train & Recall@Top1 [Macro] & 57.741 & 1 & 81.334 & 5 \\ +Train & F1-Score@Top1 [Macro] & 50.201 & 1 & 76.001 & 5 \\ +Train & Learning Rate & $3.733 \times 10^{-6}$ & 5 & $1.107 \times 10^{-5}$ & 1 \\ +Train & CPU Memory (GB) & 2.401 & 2 & 2.736 & 5 \\ +Train & CPU Usage (\%) & 16.529 & 1 & 17.160 & 5 \\ +\hline +Valid & Accuracy & 72.280 & 1 & 81.640 & 4 \\ +Valid & Loss & 0.507 & 5 & 0.731 & 1 \\ +Valid & Precision@Top1 [Macro] & 72.230 & 1 & 81.739 & 5 \\ +Valid & Recall@Top1 [Macro] & 72.258 & 1 & 82.039 & 5 \\ +Valid & F1-Score@Top1 [Macro] & 65.796 & 1 & 76.509 & 5 \\ +Valid & CPU Memory (GB) & 2.263 & 1 & 2.747 & 5 \\ +Valid & CPU Usage (\%) & 20.331 & 1 & 22.190 & 3 \\ +\hline +\end{tabular} +\label{tab:transformer_text_classification_metrics} +\end{table} + +\subsection{Resource Utilization} + +The CPU memory usage remained relatively stable throughout training. Training memory usage ranged from 2.401 GB to 2.736 GB, while validation memory usage ranged from 2.263 GB to 2.747 GB. + +CPU utilization also remained moderate, with training CPU usage ranging from 16.529\% to 17.160\% and validation CPU usage ranging from 20.331\% to 22.190\%. + +CPU temperature values were unavailable during the experiment and therefore recorded as NaN. + +\subsection{Execution Time and Failure} + +The complete training run required: + +\begin{itemize} + \item Real time: 32 minutes and 37.872 seconds + \item User CPU time: 36 minutes and 52.313 seconds + \item System CPU time: 1 minute and 41.258 seconds +\end{itemize} + +\section{PyTorch Training Pipeline} +This section details the Python implementation of the text classification training pipeline. 
The code mimics the architecture and logic of the Rust version to ensure comparable performance and behavior.
+\subsection{Code Highlights}
+\begin{itemize}
+    \item \textbf{Custom Transformer Model:}
+    The \texttt{TextClassificationModel} is a custom \texttt{nn.Module} containing:
+    \begin{itemize}
+        \item Dual embedding layers (\texttt{embedding\_token} and \texttt{embedding\_pos}).
+        \item A unique fusion strategy averaging the two embeddings: $E = (E_{pos} + E_{tok}) / 2$.
+        \item A standard \texttt{TransformerEncoder} stack.
+        \item A classification head that projects the encoded features to the 4 output classes of the AG News dataset.
+    \end{itemize}
+    \item \textbf{Noam Learning Rate Scheduler:}
+    A custom \texttt{NoamLR} scheduler is implemented to replicate the specific warmup and decay behavior used in the Rust implementation (and the original ``Attention Is All You Need'' paper).
+    \[
+    lr = \text{factor} \cdot (d_{model}^{-0.5}) \cdot \min(step^{-0.5}, step \cdot warmup^{-1.5})
+    \]
+    This ensures stable training dynamics for the Transformer architecture.
+    \item \textbf{Dataset Handling:}
+    The code utilizes the Hugging Face \texttt{datasets} library to load the ``ag\_news'' dataset. It explicitly shuffles and subsets the data (50,000 train, 5,000 test) to match the constraints applied in the Rust implementation, ensuring a fair apples-to-apples comparison between the two languages.
+    \item \textbf{Collate Function with Padding Masks:}
+    A custom \texttt{collate\_fn} handles dynamic batching. It tokenizes text using the \texttt{bert-base-cased} tokenizer and generates a boolean padding mask. 
Note the inversion logic: PyTorch's \texttt{TransformerEncoder} expects \texttt{True} for padded positions (unlike some other implementations where 1 implies validity), requiring careful mask generation: + \begin{verbatim} + mask_pad = (encoding['attention_mask'] == 0) + \end{verbatim} + \item \textbf{Training Loop:} + The training loop is a standard PyTorch implementation using \texttt{tqdm} for progress tracking. It uses \texttt{CrossEntropyLoss} as the criterion and the \texttt{Adam} optimizer. Crucially, the scheduler step is called after every batch (not every epoch), consistent with the Noam schedule requirements. +\end{itemize} + + +\section{Python: Text Classification (News) Transformer Model} + +The model is a Transformer-based architecture designed for multi-class news classification. It consists of a multi-layer encoder with multi-head self-attention and feedforward networks. + +\textbf{Model Architecture:} + +\begin{verbatim} +Model { + transformer_encoder: { + d_model: 256, + nhead: 8, + num_layers: 4, + dim_feedforward: 1024 + } + max_seq_len: 256 + num_classes: 4 + total params: 10649092 +} +\end{verbatim} + +The model was trained for 5 epochs and showed steady convergence across all evaluation metrics. + +Training accuracy improved from 56.19\% to 79.70\%, while validation accuracy increased from 68.14\% to 79.00\%. +Training loss decreased from 1.0145 to 0.5483, and validation loss reduced from 0.8137 to 0.5628. + +The model achieved a macro F1-score of 0.7903 on the validation/test set, indicating reasonably strong classification performance for a Transformer trained over a small number of epochs. 
+ +\begin{table}[h] +\centering +\begin{tabular}{|l|l|c|c|c|c|} +\hline +\textbf{Split} & \textbf{Metric} & \textbf{Min} & \textbf{Epoch} & \textbf{Max} & \textbf{Epoch} \\ +\hline + +Train & Accuracy & 56.19 & 1 & 79.70 & 5 \\ +Train & Loss & 0.5483 & 5 & 1.0145 & 1 \\ +Train & Grad Norm (Total) & 10.39 & 4 & 22.95 & 3 \\ +Train & Iteration Speed (it/s) & 44.33 & 2 & 45.05 & 1 \\ +Train & CPU Memory (GB) & 1.12 & -- & 1.12 & -- \\ +Train & CPU Usage (\%) & 22.2 & -- & 23.5 & -- \\ +\hline + +Valid & Accuracy & 68.14 & 1 & 79.00 & 5 \\ +Valid & Loss & 0.5628 & 5 & 0.8137 & 1 \\ +Valid & Precision@Top1 [Macro] & 0.7187 & 1 & 0.7942 & 5 \\ +Valid & Recall@Top1 [Macro] & 0.6815 & 1 & 0.7899 & 5 \\ +Valid & F1-Score@Top1 [Macro] & 0.6773 & 1 & 0.7903 & 5 \\ +Valid & Iteration Speed (it/s) & 44.33 & 2 & 45.05 & 1 \\ +Valid & CPU Memory (GB) & 1.12 & -- & 1.12 & -- \\ +Valid & CPU Usage (\%) & 22.2 & -- & 23.5 & -- \\ +\hline + +\end{tabular} +\caption{Transformer Text Classification Training and Validation Metrics} +\end{table} + +\textbf{Training Efficiency and Stability:} +\begin{itemize} + \item Total Training Time: 706.99 seconds + \item Average Epoch Time: 139.83 seconds + \item Iteration Speed (Mean): 44.70 it/s + \item Gradient Norm (Mean): 15.75 + \item GPU Memory Usage: 178.79 MB + \item NaN Events: 0 + \item Convergence: Monotonic loss decrease + \item Overfitting Detected: No +\end{itemize} + +The results indicate stable training and consistent improvement across epochs. While performance is lower than simpler CNN-based tasks, this is expected due to the increased complexity of natural language understanding tasks. + +\section{PyTorch Inference Pipeline Docker Image: Hybrid NFS and Docker Inference Architecture} + +This section details the hybrid deployment strategy designed to optimize Docker image size and leverage a centralized machine learning environment. 
The architecture splits the responsibilities between a \textbf{Library VM} (storage-heavy) and a \textbf{Docker VM} (compute-centric). + +\subsection{Architecture Overview} + +The system comprises two primary components: +\begin{enumerate} + \item \textbf{Library VM (NFS Server)}: Hosts the heavy Python environment, including PyTorch, Transformers, and CUDA libraries. This environment is exported via NFS. + \item \textbf{Docker VM (Inference Client)}: Runs a lightweight Docker container that mounts the external libraries at runtime. +\end{enumerate} + +\subsection{Implementation Details} + +\subsubsection{1. Library Sharing via NFS} +The Library VM exports the directory containing the Python site-packages. On the Docker VM, this directory is mounted using the \texttt{mount\_libs.sh} script. + +\begin{lstlisting}[language=bash, caption={Mounting the NFS Library Volume}] +# Configuration from mount_libs.sh +NFS_SERVER_IP="172.16.203.14" +NFS_EXPORT_PATH="/home/iiitb/Documents/textClassificationVolume" +LOCAL_MOUNT_POINT="/mnt/text-libs" + +# Mounting the remote volume +sudo mount -t nfs "$NFS_SERVER_IP:$NFS_EXPORT_PATH" "$LOCAL_MOUNT_POINT" +\end{lstlisting} + +\subsubsection{2. Lightweight Docker Image} +The Docker image is built using \texttt{Dockerfile.cpu} and excludes heavy ML libraries. It only contains the application code, the model weights, and minimal system dependencies. + +\begin{lstlisting}[language=Dockerfile, caption={Dockerfile.cpu Configuration}] +FROM python:3.12-slim + +# Point Python to the external NFS mount +ENV PYTHONPATH=/external-libs/text_env/lib/python3.12/site-packages + +# Copy only the app and model +COPY app.py ./ +COPY model_pytorch_text_classification/ag_news_model.pth ./model/ + +# No 'pip install torch' is performed here! +\end{lstlisting} + +\subsubsection{3. Runtime Execution} +The container is launched via \texttt{run\_inference.sh}, which mounts the NFS volume into the container at \texttt{/external-libs}. 
+
+\begin{lstlisting}[language=bash, caption={Running the Inference Container with the NFS Mount}]
+docker run --gpus all \
+    -v /mnt/text-libs:/external-libs \
+    -v text_model_vol:/models \
+    -e PYTHONPATH=/external-libs/text_env/lib/python3.12/site-packages \
+    -p 8000:8000 \
+    text_classification_image
+\end{lstlisting}
+
+\subsection{Impact on Image Size}
+
+This architecture drastically reduces the storage footprint of the inference artifact. By decoupling the static libraries from the application logic, we achieve the following reduction:
+
+% \begin{table}[h]
+% \centering
+% \begin{tabular}{|l|c|c|}
+% \hline
+% \textbf{Component} & \textbf{Traditional Approach} & \textbf{Hybrid NFS Approach} \\ \hline
+% Base Image (Python Slim) & $\sim$150 MB & $\sim$150 MB \\ \hline
+% PyTorch & $\sim$3.5 GB & \textbf{0 MB (Mounted)} \\ \hline
+% Transformers & $\sim$500 MB & \textbf{0 MB (Mounted)} \\ \hline
+% Application Code & $<1$ MB & $<1$ MB \\ \hline
+% Model Weights & $\sim$100 MB & $\sim$100 MB \\ \hline
+% \textbf{Total Image Size} & \textbf{8.93 GB} & \textbf{$~250$ MB} \\ \hline
+% \end{tabular}
+% \caption{Comparison of Docker Image Sizes}
+% \end{table}
+
+% This \textbf{99.03\% reduction} in image size results in:
+% \begin{itemize}
+% \item Faster deployment and rollback times.
+% \item Significantly lower network bandwidth usage.
+% \item Efficient storage utilization on the Docker VM.
+% \end{itemize}
+
+\section{Hybrid Inference Architecture with NFS and Docker}
+
+This section outlines the architectural design of our hybrid machine learning deployment strategy, detailing the distinct roles of the Library VM and the Docker VM, and how they interact to optimize resource usage.
+
+\subsection{Library Virtual Machine (NFS Server)}
+
+The \textbf{Library VM} serves as the centralized repository for the heavy components of the machine learning environment. 
Its primary function is to host large, static dependencies such as the Python runtime environment, deep learning frameworks (e.g., PyTorch, TensorFlow), and specialized libraries (e.g., Transformers, CUDA routines).
+
+By consolidating these resource-intensive libraries on a single machine, we avoid the redundancy of installing them on every inference node. This machine acts as a Network File System (NFS) server, exporting its directory structure to be accessed by other machines in the network.
+
+\subsubsection{What is an NFS Server?}
+
+A \textbf{Network File System (NFS)} server is a computer that allows other machines (clients) to access its files over a network as if they were stored locally. In our architecture, the NFS server ``shares'' the directory containing the Python libraries. The client machines can then read these files directly, eliminating the need to physically copy the heavy libraries to each client.
+
+\subsection{Docker Virtual Machine (Inference Node)}
+
+The \textbf{Docker VM} is the compute-centric node responsible for executing the inference workload. It hosts the Docker engine and runs the lightweight containerized application.
+
+This machine does not permanently store the heavy ML libraries. Instead, it mounts the shared directory from the Library VM at runtime. Reliable network connectivity to the Library VM ensures that the Docker container has immediate access to the necessary software dependencies.
+
+\subsection{Hybrid Deployment Strategy}
+
+The hybrid strategy combines the isolation and portability of Docker with the efficiency of centralized storage.
+
+\begin{enumerate}
+    \item \textbf{Decoupling Environment and Application}: We separate the rapidly changing application code (API logic, business rules) from the slowly changing environment (Python packages). The application code resides inside the Docker image, while the environment resides on the NFS share. 
+ \item \textbf{Runtime Linking}: When the Docker container starts, it mounts the NFS share. The container's environment variables are configured to add this mounted path to its Python path. This allows the Python interpreter inside the container to import modules (like \texttt{torch} or \texttt{transformers}) from the network share as if they were installed locally. + \item \textbf{Drastic Image Reduction}: Since the Docker image only contains the application code and minimal system dependencies, its size is reduced from several gigabytes to a few hundred megabytes. This facilitates rapid deployments, faster scaling, and reduced storage costs. +\end{enumerate} + +This architecture essentially transforms the Docker container into a lightweight "shell" that borrows its heavy "engine" from the Library VM only when needed. + +\subsection{Identifying the Virtual Machine Roles} + +The architecture explicitly designates two separate machines for distinct purposes. Based on the configuration scripts, their roles are defined as follows: + +\subsubsection{1. The Library VM (Environment Host)} +This machine acts as the \textbf{storage backend} for the machine learning environment. +\begin{itemize} + \item \textbf{Role}: It hosts the actual Python environment (Torch, Transformers, etc.) on its local filesystem and exports it via NFS. + \item \textbf{Identifier}: In our configuration (see \texttt{mount\_libs.sh}), this machine is identified by the IP address \texttt{172.16.203.14}. + \item \textbf{Key Path}: The environment resides at \texttt{/home/iiitb/Documents/textClassificationVolume}. + \item \textbf{Action}: It does \textit{not} run the Docker container. Ideally, it simply stays online to serve files to other machines. +\end{itemize} + +\subsubsection{2. The Docker VM (Inference Runner)} +This machine acts as the \textbf{compute frontend} that serves the API. +\begin{itemize} + \item \textbf{Role}: It builds and runs the lightweight Docker container. 
It does not have the deep learning libraries installed on its own disk; it borrows them from the Library VM. + \item \textbf{Identifier}: This is the machine where you execute the \texttt{mount\_libs.sh} and \texttt{run\_inference.sh} scripts. + \item \textbf{Key Path}: It mounts the remote library to the local path \texttt{/mnt/text-libs}. + \item \textbf{Action}: It executes the \texttt{docker run} command, effectively "bringing the code to the data" (or in this case, bringing the library data to the code container). +\end{itemize} + +\begin{table}[h] +\centering +\begin{tabular}{|l|l|l|} +\hline +\textbf{Feature} & \textbf{Library VM} & \textbf{Docker VM} \\ \hline +\textbf{Primary Function} & Storage \& NFS Server & Model Inference \& API Hosting \\ \hline +\textbf{IP Address} & \texttt{172.16.203.14} & (Assigned by Network) \\ \hline +\textbf{Python Libs} & Stored Physically on Disk & Mounted via Network (NFS) \\ \hline +\textbf{Docker Image} & Not required & Builds \& Runs Lightweight Image \\ \hline +\end{tabular} +\caption{Distinction between Library VM and Docker VM} +\end{table} + + + + +\newpage + +\section{Container Comparison: Python vs Rust} + +\begin{table}[h] +\centering +\begin{tabular}{|l|c|c|} +\hline +\textbf{Feature} & \textbf{Python (GPU)} & \textbf{Rust (WGPU)} \\ +\hline +Image Name & text\_classification\_image & text\_class\_rs \\ +\hline +Size & 4.02 GB & $\sim$1 GB \\ +\hline +Backend & PyTorch + CUDA & Native Rust (WGPU) \\ +\hline +Startup Time & Slower & Faster \\ +\hline +Dependencies & Heavy (Torch, CUDA, Python) & Minimal (compiled binary) \\ +\hline +Deployment Complexity & Higher & Lower \\ +\hline +Flexibility & High (research-friendly) & Moderate \\ +\hline +Runtime Stability & Medium & High \\ +\hline +\end{tabular} +\caption{Comparison of Python GPU-based and Rust-based inference containers} +\end{table} + +\noindent +The Python-based container provides flexibility and rapid experimentation using the PyTorch ecosystem, but at 
the cost of larger image size and dependency complexity. In contrast, the Rust-based container offers a lightweight, production-ready solution with faster startup time and minimal runtime dependencies, making it more suitable for deployment scenarios. + + +\section{Model Size Comparison} + +\begin{table}[h] +\centering +\begin{tabular}{|l|c|c|} +\hline +\textbf{Model} & \textbf{Rust (.mpk/.bin)} & \textbf{Python (.pt/.pth)} \\ +\hline +MNIST & 1.1 MB & 2 MB \\ +\hline +Text Classification (AG News) & 21 MB & 40.6 MB \\ +\hline +Regression & 4 KB & 6 MB \\ +\hline +LSTM & 60 KB & 120 KB \\ +\hline +\end{tabular} +\caption{Comparison of model sizes between Rust and Python implementations} +\end{table} + +\noindent +Rust-based serialized models are consistently smaller than their Python counterparts. This reduction is most significant in simpler models such as regression, and remains substantial for larger models like text classification. The smaller footprint of Rust models makes them more suitable for lightweight deployment and resource-constrained environments. 
+
+
+\end{document}
diff --git a/python_ml/benchmark/bench_lstm.py b/python_ml/benchmark/bench_lstm.py
new file mode 100644
index 0000000..1ec8631
--- /dev/null
+++ b/python_ml/benchmark/bench_lstm.py
@@ -0,0 +1,16 @@
+from locust import HttpUser, task, between
+import numpy as np
+
+IP = "127.0.0.1"
+PORT = "9050"
+SEQ_LEN = 4
+
+class LoadTestprofile(HttpUser):
+    wait_time = between(0.1,0.2)
+    host = f"http://{IP}:{PORT}"
+    # Single class-level RNG shared by all simulated users (was a duplicated `rng = rng =` assignment).
+    rng = np.random.default_rng()
+    @task
+    def load_task(self):
+        random_list = self.rng.uniform(0.0, 10.0, SEQ_LEN).tolist()
+        _ = self.client.post("/predict",json=random_list)
\ No newline at end of file
diff --git a/python_ml/benchmark/bench_regression.py b/python_ml/benchmark/bench_regression.py
new file mode 100644
index 0000000..8f0e019
--- /dev/null
+++ b/python_ml/benchmark/bench_regression.py
@@ -0,0 +1,44 @@
+from locust import HttpUser, task, between
+import pandas as pd
+import numpy as np
+
+IP = "127.0.0.1"
+PORT = "9050"
+PATH_TO_DSET = "test_data/regression/cali_housing.parquet"
+
+'''
+# download outside the script
+
+import pandas as pd
+
+df = pd.read_parquet(
+    "hf://datasets/gvlassis/california_housing/data/test-00000-of-00001.parquet"
+)
+df.to_parquet("cali_housing.parquet")
+'''
+
+class LoadTestprofile(HttpUser):
+    wait_time = between(0.1,0.2)
+    host = f"http://{IP}:{PORT}"
+
+    @task
+    def load_task(self):
+        random_row = self.df.iloc[np.random.randint(0,len(self.df))]
+        text_to_send = {
+            'MedInc' : random_row['MedInc'],
+            'HouseAge' : random_row['HouseAge'],
+            'AveRooms' : random_row['AveRooms'],
+            'AveBedrms' : random_row['AveBedrms'],
+            'Population' : random_row['Population'],
+            'AveOccup' : random_row['AveOccup'],
+            'Latitude' : random_row['Latitude'],
+            'Longitude' : random_row['Longitude'],
+            'MedHouseVal' : random_row['MedHouseVal']
+        }
+        _ = self.client.post("/predict",json=text_to_send)
+        # is_correct = (response.json()['prediction'] == self.dict_class[random_row['label']])
+
+
+
+    def on_start(self):
+        
self.df = pd.read_parquet(PATH_TO_DSET) \ No newline at end of file diff --git a/python_ml/benchmark/bench_text_class.py b/python_ml/benchmark/bench_text_class.py index 4dc1347..3423e7c 100644 --- a/python_ml/benchmark/bench_text_class.py +++ b/python_ml/benchmark/bench_text_class.py @@ -27,22 +27,22 @@ def load_task(self): text_to_send = { 'text' : random_row['text'] } - response = self.client.post("/predict",json=text_to_send) - is_correct = (response.json()['prediction'] == self.dict_class[random_row['label']]) - events.request.fire( - request_type="ML", - name="accuracy", - response_time=0, - response_length=1, - exception=None if is_correct else Exception("wrong"), - ) + _ = self.client.post("/predict",json=text_to_send) + # is_correct = (response.json()['prediction'] == self.dict_class[random_row['label']]) + # events.request.fire( + # request_type="ML", + # name="accuracy", + # response_time=0, + # response_length=1, + # exception=None if is_correct else Exception("wrong"), + # ) def on_start(self): self.df = pd.read_parquet(PATH_TO_DSET) - self.dict_class = { - 0 : "World", - 1 : "Sports", - 2 : "Business", - 3 : "Technology" - } \ No newline at end of file + # self.dict_class = { + # 0 : "World", + # 1 : "Sports", + # 2 : "Business", + # 3 : "Technology" + # } \ No newline at end of file diff --git a/python_ml/pytorch/regression/Inference/app.py b/python_ml/pytorch/regression/Inference/app.py index 5e4ea49..366e1a1 100644 --- a/python_ml/pytorch/regression/Inference/app.py +++ b/python_ml/pytorch/regression/Inference/app.py @@ -11,7 +11,10 @@ # Constants (must match training) # ========================================================== -NUM_FEATURES = 13 +NUM_FEATURES = 8 + +FEATURES_MIN = torch.tensor([0.4999, 1., 0.8461, 0.375, 3., 0.6923, 32.54, -124.35], dtype=torch.float32) +FEATURES_MAX = torch.tensor([15., 52., 141.9091, 34.0667, 35682., 1243.3333, 41.95, -114.31], dtype=torch.float32) GENERATED_DIR = 
os.path.join(os.path.dirname(os.path.abspath(__file__)), "generated") @@ -81,12 +84,13 @@ class HousingFeatures(BaseModel): def preprocess(features): - x = np.array(features, dtype=np.float32) + x = torch.tensor(features, dtype=torch.float32) if len(x) != NUM_FEATURES: raise ValueError(f"Expected {NUM_FEATURES} features") - x = torch.tensor(x).unsqueeze(0) + x = x.unsqueeze(0) + x = (x - FEATURES_MIN) / (FEATURES_MAX - FEATURES_MIN) return x diff --git a/python_ml/pytorch/regression/Training/training.py b/python_ml/pytorch/regression/Training/training.py index 096b368..78e5b2a 100644 --- a/python_ml/pytorch/regression/Training/training.py +++ b/python_ml/pytorch/regression/Training/training.py @@ -3,10 +3,10 @@ import shutil import time import random -import urllib.request import psutil import torch import numpy as np +from datasets import load_dataset from dataclasses import dataclass from torch import nn from torch.utils.data import Dataset, DataLoader @@ -15,49 +15,12 @@ # Constants # ========================================================== -NUM_FEATURES = 13 +NUM_FEATURES = 8 OUTPUT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "generated") -DATASET_URL = "https://storage.googleapis.com/tensorflow/tf-keras-datasets/boston_housing.npz" -RAW_DATA_FILE = os.path.join(OUTPUT_DIR, "boston_housing.npz") - -TRAIN_FILE = os.path.join(OUTPUT_DIR, "train_data.npz") -VALID_FILE = os.path.join(OUTPUT_DIR, "valid_data.npz") - - -# ========================================================== -# Dataset preparation -# ========================================================== - -def prepare_dataset(): - - os.makedirs(OUTPUT_DIR, exist_ok=True) - - if not os.path.exists(RAW_DATA_FILE): - print("Downloading Boston Housing dataset...") - urllib.request.urlretrieve(DATASET_URL, RAW_DATA_FILE) - print("Download complete.") - - if not os.path.exists(TRAIN_FILE) or not os.path.exists(VALID_FILE): - - data = np.load(RAW_DATA_FILE) - - X = data["x"] - y = data["y"] - 
- split = int(0.8 * len(X)) - - X_train = X[:split] - y_train = y[:split] - - X_valid = X[split:] - y_valid = y[split:] - - np.savez(TRAIN_FILE, x=X_train, y=y_train) - np.savez(VALID_FILE, x=X_valid, y=y_valid) - - print("Dataset prepared.") +FEATURES_MIN = torch.tensor([0.4999, 1., 0.8461, 0.375, 3., 0.6923, 32.54, -124.35], dtype=torch.float32) +FEATURES_MAX = torch.tensor([15., 52., 141.9091, 34.0667, 35682., 1243.3333, 41.95, -114.31], dtype=torch.float32) # ========================================================== @@ -66,9 +29,28 @@ def prepare_dataset(): class HousingDataset(Dataset): - def __init__(self, inputs, targets): - self.inputs = torch.tensor(inputs, dtype=torch.float32) + def __init__(self, split="train"): + hf_dataset = load_dataset("gvlassis/california_housing", split=split) + + # Extract features in the correct order: + # MedInc, HouseAge, AveRooms, AveBedrms, Population, AveOccup, Latitude, Longitude + features = [ + hf_dataset["MedInc"], + hf_dataset["HouseAge"], + hf_dataset["AveRooms"], + hf_dataset["AveBedrms"], + hf_dataset["Population"], + hf_dataset["AveOccup"], + hf_dataset["Latitude"], + hf_dataset["Longitude"] + ] + + targets = hf_dataset["MedHouseVal"] + + inputs = torch.tensor(features, dtype=torch.float32).T self.targets = torch.tensor(targets, dtype=torch.float32) + + self.inputs = (inputs - FEATURES_MIN) / (FEATURES_MAX - FEATURES_MIN) def __len__(self): return len(self.inputs) @@ -78,21 +60,11 @@ def __getitem__(self, idx): @staticmethod def train(): - - prepare_dataset() - - data = np.load(TRAIN_FILE) - - return HousingDataset(data["x"], data["y"]) + return HousingDataset("train") @staticmethod def validation(): - - prepare_dataset() - - data = np.load(VALID_FILE) - - return HousingDataset(data["x"], data["y"]) + return HousingDataset("validation") # ========================================================== diff --git a/reports/rust/lstm.html b/reports/rust/lstm.html new file mode 100644 index 0000000..43ae5e7 --- /dev/null 
+++ b/reports/rust/lstm.html @@ -0,0 +1,124 @@ + + + + + + + + + + + Locust + + + + +
+ + + + + \ No newline at end of file diff --git a/reports/rust/mnist.html b/reports/rust/mnist.html new file mode 100644 index 0000000..6f73a2f --- /dev/null +++ b/reports/rust/mnist.html @@ -0,0 +1,124 @@ + + + + + + + + + + + Locust + + + + +
+ + + + + \ No newline at end of file diff --git a/reports/rust/regression.html b/reports/rust/regression.html new file mode 100644 index 0000000..0c554d7 --- /dev/null +++ b/reports/rust/regression.html @@ -0,0 +1,124 @@ + + + + + + + + + + + Locust + + + + +
+ + + + + \ No newline at end of file diff --git a/reports/rust/text_class.html b/reports/rust/text_class.html new file mode 100644 index 0000000..e9417ab --- /dev/null +++ b/reports/rust/text_class.html @@ -0,0 +1,124 @@ + + + + + + + + + + + Locust + + + + +
+ + + + + \ No newline at end of file diff --git a/rust_ml/Cargo.toml b/rust_ml/Cargo.toml index a24c637..ea509eb 100644 --- a/rust_ml/Cargo.toml +++ b/rust_ml/Cargo.toml @@ -1,6 +1,6 @@ [workspace] resolver = "3" -members = ["lstm_train","mnist_infer","mnist_ml", "regression", "text_classification_infer", "text_classification_news", "text_gen_train"] +members = ["lstm_inference","lstm_train","mnist_infer","mnist_ml", "regression", "regression_inference", "text_classification_infer", "text_classification_news", "text_gen_train"] [workspace.lints.clippy] all = "warn" diff --git a/rust_ml/Dockerfile.lstm_inf b/rust_ml/Dockerfile.lstm_inf new file mode 100644 index 0000000..da26e46 --- /dev/null +++ b/rust_ml/Dockerfile.lstm_inf @@ -0,0 +1,44 @@ +# ----------------------- +# Build Stage +# ----------------------- +FROM ubuntu:16.04 AS builder + +WORKDIR /app/rust_ml + +RUN apt-get update && apt-get install -y \ + curl \ + build-essential \ + pkg-config \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +# Install Rust +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y +ENV PATH="/root/.cargo/bin:${PATH}" + +COPY . . 
+ +# Build the release binary for lstm_inference +RUN cargo build --release -p lstm_inference + +# ----------------------- +# Runtime Stage +# ----------------------- +FROM nvidia/vulkan:1.3-470 + +WORKDIR /app + +# Copy compiled binary from builder +COPY --from=builder /app/rust_ml/target/release/lstm_inference /app/binary + +COPY ./model/lstm_train/config.json /app/model/lstm_train/config.json +COPY ./model/lstm_train/model.mpk /app/model/lstm_train/model.mpk + +# Environment variables +ENV RUST_LOG=info +# Setup the default mounted model path +ENV MODEL_DIR=/app/model/lstm_train + +EXPOSE 9050 + +CMD ["./binary"] diff --git a/rust_ml/Dockerfile.regression_inf b/rust_ml/Dockerfile.regression_inf new file mode 100644 index 0000000..2d7c027 --- /dev/null +++ b/rust_ml/Dockerfile.regression_inf @@ -0,0 +1,44 @@ +# ----------------------- +# Build Stage +# ----------------------- +FROM ubuntu:16.04 AS builder + +WORKDIR /app/rust_ml + +RUN apt-get update && apt-get install -y \ + curl \ + build-essential \ + pkg-config \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +# Install Rust +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y +ENV PATH="/root/.cargo/bin:${PATH}" + +# Copy workspace +# This assumes the build context is the `rust_ml` root directory. +COPY . . 
+ +# Build the release binary for regression_inference +RUN cargo build --release -p regression_inference + +# ----------------------- +# Runtime Stage +# ----------------------- +FROM nvidia/vulkan:1.3-470 + +WORKDIR /app + +# Copy compiled binary from builder +COPY --from=builder /app/rust_ml/target/release/regression_inference /app/binary + +COPY ./model/regression_train/model.bin /app/model/regression_train/model.bin +# Environment variables +ENV RUST_LOG=info +# Setup the default mounted model path +ENV MODEL_PATH=/app/model/regression_train/model.bin + +EXPOSE 9050 + +CMD ["./binary"] diff --git a/rust_ml/lstm_inference/Cargo.toml b/rust_ml/lstm_inference/Cargo.toml new file mode 100644 index 0000000..816d3e7 --- /dev/null +++ b/rust_ml/lstm_inference/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "lstm_inference" +version = "0.1.0" +edition = "2024" + +[dependencies] +lstm_train = { path = "../lstm_train" } +burn = { version = "~0.20", features = ["std", "wgpu"], default-features = false } +axum = "0.8" +tokio = { version = "1", features = ["full"] } +serde = { version = "1", features = ["derive"] } +serde_json = "1" +tracing = "0.1" +tracing-subscriber = { version = "0.3", features = ["env-filter"] } +tower-http = { version = "0.6", features = ["cors", "trace"] } + +[lints] +workspace = true diff --git a/rust_ml/lstm_inference/src/inference.rs b/rust_ml/lstm_inference/src/inference.rs new file mode 100644 index 0000000..7f618fe --- /dev/null +++ b/rust_ml/lstm_inference/src/inference.rs @@ -0,0 +1,56 @@ +use axum::{ + extract::State, + http::StatusCode, + Json, +}; +use burn::data::dataloader::batcher::Batcher; +use lstm_train::dataset::{SequenceBatcher, SequenceDatasetItem}; +use serde::Serialize; + +use crate::state::{AppState, MyBackend}; + +#[derive(Serialize)] +pub struct PredictResponse { + pub predicted_next_value: f32, +} + +#[derive(Serialize)] +pub struct ErrorResponse { + pub error: String, +} + +pub async fn predict_handler( + State(state): 
State, + Json(payload): Json>, +) -> Result, (StatusCode, Json)> { + let device: ::Device = Default::default(); + + // Explicitly construct the dataset mapping bypassing manual target generation for clients + let item = SequenceDatasetItem { + sequence: payload, + target: 0.0, + }; + + // Create batcher mapped to backend + let batcher = SequenceBatcher::default(); + + // Process item into batched tensors + let batch = batcher.batch(vec![item], &device); + + // Perform forward pass inference + let output = state.model.lock().unwrap().forward(batch.sequences, None); + + // Extract single result + let predicted_tensors = output.squeeze_dim::<1>(1).into_data(); + + let predicted_value = predicted_tensors + .as_slice::() + .unwrap_or(&[]) + .first() + .copied() + .unwrap_or(0.0); + + Ok(Json(PredictResponse { + predicted_next_value: predicted_value, + })) +} diff --git a/rust_ml/lstm_inference/src/main.rs b/rust_ml/lstm_inference/src/main.rs new file mode 100644 index 0000000..c71e0da --- /dev/null +++ b/rust_ml/lstm_inference/src/main.rs @@ -0,0 +1,50 @@ +#![recursion_limit = "256"] +mod inference; +mod model; +mod state; + +use axum::{ + routing::{get, post}, + Router, +}; +use std::net::SocketAddr; +use tower_http::cors::CorsLayer; +use tower_http::trace::TraceLayer; +use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt}; + +use crate::inference::predict_handler; +use crate::state::AppState; + +#[tokio::main] +async fn main() { + // Initialize tracing + tracing_subscriber::registry() + .with(tracing_subscriber::EnvFilter::new( + std::env::var("RUST_LOG").unwrap_or_else(|_| "info".into()), + )) + .with(tracing_subscriber::fmt::layer()) + .init(); + + // Load Model State + let model_dir = std::env::var("MODEL_DIR") + .unwrap_or_else(|_| -> String { "model/lstm_train".to_string() }); + + // Let the AppState construct the pre-loaded memory model + let state = AppState::new(&model_dir); + + // Build Axum Router + let app = Router::new() + 
.route("/health", get(|| async { "OK" })) + .route("/predict", post(predict_handler)) + .layer(CorsLayer::permissive()) + .layer(TraceLayer::new_for_http()) + .with_state(state); + + // Run Server + let port = 9050; + let addr = SocketAddr::from(([0, 0, 0, 0], port)); + tracing::info!("Server listening on http://{}", addr); + + let listener = tokio::net::TcpListener::bind(addr).await.unwrap(); + axum::serve(listener, app).await.unwrap(); +} diff --git a/rust_ml/lstm_inference/src/model.rs b/rust_ml/lstm_inference/src/model.rs new file mode 100644 index 0000000..9537858 --- /dev/null +++ b/rust_ml/lstm_inference/src/model.rs @@ -0,0 +1,380 @@ +use burn::{ + nn::{ + Dropout, DropoutConfig, Initializer, LayerNorm, LayerNormConfig, Linear, LinearConfig, + LstmState, Sigmoid, Tanh, + }, + prelude::*, +}; + +/// LSTM Cell implementation with layer normalization. +/// +/// Mathematical formulation of LSTM: +/// f_t = σ(W_f · [h_{t-1}, x_t] + b_f) # Forget gate +/// i_t = σ(W_i · [h_{t-1}, x_t] + b_i] # Input gate +/// g_t = tanh(W_g · [h_{t-1}, x_t] + b_g] # Candidate cell state +/// o_t = σ(W_o · [h_{t-1}, x_t] + b_o) # Output gate +/// +/// c_t = f_t ⊙ c_{t-1} + i_t ⊙ g_t # New cell state +/// h_t = o_t ⊙ tanh(c_t) # New hidden state +/// +/// where: +/// - σ is the sigmoid function +/// - ⊙ is the element-wise multiplication +/// - [h_{t-1}, x_t] represents concatenation + +#[derive(Module, Debug)] +pub struct LstmCell { + pub hidden_size: usize, + // Combined weight matrices for efficiency + // weight_ih layer uses combined weights for [i_t, f_t, g_t, o_t] for input x_t + // weight_hh layer uses combined weights for [i_t, f_t, g_t, o_t] for hidden state h_{t-1} + pub weight_ih: Linear, + pub weight_hh: Linear, + // Layer Normalization for better training stability. Don't use BatchNorm because the input distribution is always changing for LSTM. 
+ pub norm_x: LayerNorm, // Normalize gate pre-activations + pub norm_h: LayerNorm, // Normalize hidden state + pub norm_c: LayerNorm, // Normalize cell state + pub dropout: Dropout, +} + +/// Configuration to create a Lstm module using the init function. +#[derive(Config, Debug)] +pub struct LstmCellConfig { + // The size of the input features + pub input_size: usize, + // The size of the hidden state + pub hidden_size: usize, + // The number of hidden layers + pub dropout: f64, +} + +impl LstmCellConfig { + // Initialize parameters using best practices: + // 1. Orthogonal initialization for better gradient flow (here we use Xavier because of the lack of Orthogonal in burn) + // 2. Initialize forget gate bias to 1.0 to prevent forgetting at start of training + #[allow(clippy::single_range_in_vec_init)] + pub fn init( + &self, + device: &B::Device, + ) -> LstmCell { + let initializer = Initializer::XavierNormal { gain: 1.0 }; + let init_bias = Tensor::::ones([self.hidden_size], device); + + let mut weight_ih = LinearConfig::new(self.input_size, 4 * self.hidden_size) + .with_initializer(initializer.clone()) + .init(device); + // Set forget gate bias to 1.0 (helps with learning long sequences) + let bias = weight_ih + .bias + .clone() + .unwrap() + .val() + .slice_assign([self.hidden_size..2 * self.hidden_size], init_bias.clone()); + weight_ih.bias = weight_ih.bias.map(|p| p.map(|_t| bias)); + + let mut weight_hh = LinearConfig::new(self.hidden_size, 4 * self.hidden_size) + .with_initializer(initializer) + .init(device); + let bias = weight_hh + .bias + .clone() + .unwrap() + .val() + .slice_assign([self.hidden_size..2 * self.hidden_size], init_bias); + weight_hh.bias = weight_hh.bias.map(|p| p.map(|_t| bias)); + + LstmCell { + hidden_size: self.hidden_size, + weight_ih, + weight_hh, + norm_x: LayerNormConfig::new(4 * self.hidden_size).init(device), + norm_h: LayerNormConfig::new(self.hidden_size).init(device), + norm_c: 
LayerNormConfig::new(self.hidden_size).init(device), + dropout: DropoutConfig::new(self.dropout).init(), + } + } +} + +impl LstmCell { + /// Forward pass of LSTM cell. + /// Args: + /// x: Input tensor of shape (batch_size, input_size) + /// state: Tuple of (h_{t-1}, c_{t-1}) each of shape (batch_size, hidden_size) + /// Returns: + /// Tuple of (h_t, c_t) representing new hidden and cell states + pub fn forward( + &self, + x: Tensor, + state: LstmState, + ) -> LstmState { + let (h_prev, c_prev) = (state.hidden, state.cell); + + // Combined matrix multiplication for all gates + // Shape: (batch_size, 4 * hidden_size) + let gates_x = self.weight_ih.forward(x); // Transform input + let gates_h = self.weight_hh.forward(h_prev); // Transform previous hidden state + + // Apply layer normalization + let gates_x = self.norm_x.forward(gates_x); + // Combined gate pre-activations + let gates = gates_x + gates_h; + + // Split into individual gates + // Each gate shape: (batch_size, hidden_size) + let gates = gates.chunk(4, 1); + let i_gate = gates[0].clone(); + let f_gate = gates[1].clone(); + let g_gate = gates[2].clone(); + let o_gate = gates[3].clone(); + + // Apply gate non-linearities + let i_t = Sigmoid::new().forward(i_gate); + let f_t = Sigmoid::new().forward(f_gate); + let g_t = Tanh::new().forward(g_gate); + let o_t = Sigmoid::new().forward(o_gate); + + // Update cell state: c_t = f_t ⊙ c_{t-1} + i_t ⊙ g_t + let c_t = f_t * c_prev + i_t * g_t; + let c_t = self.norm_c.forward(c_t); + + // Update cell state: h_t = o_t ⊙ tanh(c_t) + let h_t = o_t * Tanh::new().forward(c_t.clone()); + let h_t = self.norm_h.forward(h_t); + + let h_t = self.dropout.forward(h_t); + + LstmState::new(h_t, c_t) + } + + // Initialize cell state and hidden state if provided or with zeros + pub fn init_state( + &self, + batch_size: usize, + device: &B::Device, + ) -> LstmState { + let cell = Tensor::zeros([batch_size, self.hidden_size], device); + let hidden = Tensor::zeros([batch_size, 
self.hidden_size], device); + + LstmState::new(cell, hidden) + } +} + +/// Stacked LSTM implementation supporting multiple layers +/// Each layer processes the output of the previous layer +#[derive(Module, Debug)] +pub struct StackedLstm { + pub layers: Vec>, +} + +#[derive(Config, Debug)] +pub struct StackedLstmConfig { + pub input_size: usize, + pub hidden_size: usize, + pub num_layers: usize, + pub dropout: f64, +} + +impl StackedLstmConfig { + pub fn init( + &self, + device: &B::Device, + ) -> StackedLstm { + let mut layers: Vec> = vec![]; + // Create list of LSTM cells, one for each layer + for i in 0..self.num_layers { + if i == 0 { + if i < self.num_layers - 1 { + layers.push( + LstmCellConfig::new(self.input_size, self.hidden_size, self.dropout) + .init(device), + ); + } else { + // No dropout on last layer + layers.push( + LstmCellConfig::new(self.input_size, self.hidden_size, 0.0).init(device), + ); + } + } else if i < self.num_layers - 1 { + layers.push( + LstmCellConfig::new(self.hidden_size, self.hidden_size, self.dropout) + .init(device), + ); + } else { + // No dropout on last layer + layers.push( + LstmCellConfig::new(self.hidden_size, self.hidden_size, 0.0).init(device), + ); + } + } + StackedLstm { layers } + } +} + +impl StackedLstm { + /// Process input sequence through stacked LSTM layers. 
+ /// + /// Args: + /// x: Input tensor of shape (batch_size, seq_length, input_size) + /// states: Optional initial states for each layer + /// + /// Returns: + /// Tuple of (output, states) where output has shape (batch_size, seq_length, hidden_size) + /// and states is a vector of length num_layers, both cell and hidden state in each element have shape (batch_size, hidden_size) + pub fn forward( + &self, + x: Tensor, + states: Option>>, + ) -> (Tensor, Vec>) { + let [batch_size, seq_length, _] = x.dims(); + let device = x.device(); + + let mut states = match states { + None => { + let mut temp: Vec> = vec![]; + for layer in self.layers.iter() { + temp.push(layer.init_state(batch_size, &device)); + } + temp + } + _ => states.unwrap(), + }; + + let mut layer_outputs = vec![]; + for t in 0..seq_length { + let mut input_t = x.clone().slice(s![.., t..t + 1, ..]).squeeze_dim::<2>(1); + for (i, lstm_cell) in self.layers.iter().enumerate() { + let mut state: LstmState = + LstmState::new(states[i].cell.clone(), states[i].hidden.clone()); + state = lstm_cell.forward(input_t, state); + input_t = state.hidden.clone(); + states[i] = state; + } + layer_outputs.push(input_t); + } + + // Stack output along sequence dimension + let output = Tensor::stack(layer_outputs, 1); + + (output, states) + } +} + +/// Complete LSTM network with bidirectional support. 
+/// +/// In bidirectional mode: +/// - Forward LSTM processes sequence from left to right +/// - Backward LSTM processes sequence from right to left +/// - Outputs are concatenated for final prediction +#[derive(Module, Debug)] +pub struct LstmNetwork { + // Forward direction LSTM + pub stacked_lstm: StackedLstm, + // Optional backward direction LSTM for bidirectional processing + pub reverse_lstm: Option>, + pub dropout: Dropout, + pub fc: Linear, +} + +#[derive(Config, Debug)] +pub struct LstmNetworkConfig { + #[config(default = 1)] + pub input_size: usize, // Single feature (number sequence) + #[config(default = 32)] + pub hidden_size: usize, // Size of LSTM hidden state + #[config(default = 2)] + pub num_layers: usize, // Number of LSTM layers + #[config(default = 1)] + pub output_size: usize, // Predict one number + #[config(default = 0.1)] + pub dropout: f64, + #[config(default = true)] + pub bidirectional: bool, // Use bidirectional LSTM +} + +impl LstmNetworkConfig { + pub fn init( + &self, + device: &B::Device, + ) -> LstmNetwork { + // Forward direction LSTM + let stacked_lstm = StackedLstmConfig::new( + self.input_size, + self.hidden_size, + self.num_layers, + self.dropout, + ) + .init(device); + + // Optional backward direction LSTM for bidirectional processing + let (reverse_lstm, hidden_size) = if self.bidirectional { + let lstm = StackedLstmConfig::new( + self.input_size, + self.hidden_size, + self.num_layers, + self.dropout, + ) + .init(device); + (Some(lstm), 2 * self.hidden_size) + } else { + (None, self.hidden_size) + }; + + let fc = LinearConfig::new(hidden_size, self.output_size).init(device); + let dropout = DropoutConfig::new(self.dropout).init(); + + LstmNetwork { + stacked_lstm, + reverse_lstm, + dropout, + fc, + } + } +} + +impl LstmNetwork { + /// Forward pass of the network. + /// + /// For bidirectional processing: + /// 1. Process sequence normally with forward LSTM + /// 2. Process reversed sequence with backward LSTM + /// 3. 
Concatenate both outputs + /// 4. Apply final linear transformation + /// + /// Args: + /// x: Input tensor of shape (batch_size, seq_length, input_size) + /// states: Optional initial states + /// + /// Returns: + /// Output tensor of shape (batch_size, output_size) + pub fn forward( + &self, + x: Tensor, + states: Option>>, + ) -> Tensor { + let seq_length = x.dims()[1]; + // Forward direction + let (mut output, _states) = self.stacked_lstm.forward(x.clone(), states); + + output = match &self.reverse_lstm { + Some(reverse_lstm) => { + //Process sequence in reverse direction + let (mut reverse_output, _states) = reverse_lstm.forward(x.flip([1]), None); + // Flip back to align with forward sequence + reverse_output = reverse_output.flip([1]); + // Concatenate forward and backward outputs along the feature dimension + output = Tensor::cat(vec![output, reverse_output], 2); + output + } + None => output, + }; + + // Apply dropout before final layer + output = self.dropout.forward(output); + // Use final timestep output for prediction + self.fc.forward( + output + .slice(s![.., seq_length - 1..seq_length, ..]) + .squeeze_dim::<2>(1), + ) + } +} diff --git a/rust_ml/lstm_inference/src/state.rs b/rust_ml/lstm_inference/src/state.rs new file mode 100644 index 0000000..ec06bff --- /dev/null +++ b/rust_ml/lstm_inference/src/state.rs @@ -0,0 +1,44 @@ +use burn::{ + backend::Wgpu, + module::Module, + prelude::Config, + record::{CompactRecorder, Recorder}, +}; +use crate::model::{LstmNetwork, LstmNetworkConfig}; +use std::sync::{Arc, Mutex}; + +pub type MyBackend = Wgpu; + +#[derive(Config,Debug)] +pub struct InferenceConfig { + pub model: LstmNetworkConfig, +} + +#[derive(Clone)] +pub struct AppState { + pub model: Arc>>, +} + +impl AppState { + pub fn new(model_dir: &str) -> Self { + let device = Default::default(); + + let config_path = format!("{}/config.json", model_dir); + let model_path = format!("{}/model", model_dir); + + // Load training configuration + let config = 
InferenceConfig::load(&config_path) + .expect("Config should exist for the model; run train first"); + + // Load model configuration and initialized layers + let record = CompactRecorder::new() + .load(model_path.into(), &device) + .expect("Trained model should exist; run train first"); + + let model: LstmNetwork = config.model.init(&device).load_record(record); + + Self { + model: Arc::new(Mutex::new(model)), + } + } +} diff --git a/rust_ml/mnist_ml/src/main.rs b/rust_ml/mnist_ml/src/main.rs index 03c994e..ac10cc6 100644 --- a/rust_ml/mnist_ml/src/main.rs +++ b/rust_ml/mnist_ml/src/main.rs @@ -1,3 +1,4 @@ +#![recursion_limit = "256"] mod data; mod model; mod training; diff --git a/rust_ml/regression_inference/Cargo.toml b/rust_ml/regression_inference/Cargo.toml new file mode 100644 index 0000000..afea4d1 --- /dev/null +++ b/rust_ml/regression_inference/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "regression_inference" +version = "0.1.0" +edition = "2024" + +[dependencies] +regression = { path = "../regression" } +burn = { version = "~0.20", features = ["std", "wgpu"], default-features = false } +axum = "0.8" +tokio = { version = "1", features = ["full"] } +serde = { version = "1", features = ["derive"] } +serde_json = "1" +tracing = "0.1" +tracing-subscriber = { version = "0.3", features = ["env-filter"] } +tower-http = { version = "0.6", features = ["cors", "trace"] } + +[lints] +workspace = true diff --git a/rust_ml/regression_inference/src/inference.rs b/rust_ml/regression_inference/src/inference.rs new file mode 100644 index 0000000..37f985a --- /dev/null +++ b/rust_ml/regression_inference/src/inference.rs @@ -0,0 +1,51 @@ +use axum::{ + extract::State, + http::StatusCode, + Json, +}; +use burn::data::dataloader::batcher::Batcher; +use regression::dataset::{HousingBatcher, HousingDistrictItem}; +use serde::Serialize; + +use crate::state::{AppState, Backend}; + +#[derive(Serialize)] +pub struct PredictResponse { + pub predicted_median_house_value: f32, +} + 
+#[derive(Serialize)] +pub struct ErrorResponse { + pub error: String, +} + +pub async fn predict_handler( + State(state): State, + Json(payload): Json, +) -> Result, (StatusCode, Json)> { + let device: ::Device = Default::default(); + + // Create batcher mapped to backend + let batcher = HousingBatcher::::new(device.clone()); + + // Process item + // Note: HousingBatcher::batch transforms a Vec into a HousingBatch + let batch = batcher.batch(vec![payload], &device); + + // Perform forward pass inference + let output = state.model.lock().unwrap().forward(batch.inputs); + + // Extract single result + let predicted_tensors = output.squeeze_dim::<1>(1).into_data(); + + // Assuming `into_data()` gives us Burn's generic `Data`, we extract f32 value. + // Since we batched a single item, it should be the first entry + let predicted_value = predicted_tensors + .iter::() + .next() + .unwrap_or(0.0); + + Ok(Json(PredictResponse { + predicted_median_house_value: predicted_value, + })) +} diff --git a/rust_ml/regression_inference/src/main.rs b/rust_ml/regression_inference/src/main.rs new file mode 100644 index 0000000..d410b9e --- /dev/null +++ b/rust_ml/regression_inference/src/main.rs @@ -0,0 +1,50 @@ +mod inference; +mod model; +mod state; + +use axum::{ + routing::{get, post}, + Router, +}; +use std::net::SocketAddr; +use tower_http::cors::CorsLayer; +use tower_http::trace::TraceLayer; +use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt}; + +use crate::inference::predict_handler; +use crate::state::AppState; + +#[tokio::main] +async fn main() { + // Initialize tracing + tracing_subscriber::registry() + .with(tracing_subscriber::EnvFilter::new( + std::env::var("RUST_LOG").unwrap_or_else(|_| "info".into()), + )) + .with(tracing_subscriber::fmt::layer()) + .init(); + + // Load Model State + // In Docker, it'll mount to /app/model. 
mpk is the default extension from burn's NoStdTrainingRecorder + let model_path = std::env::var("MODEL_PATH") + .unwrap_or_else(|_| -> String { "model/regression_train/model.bin".to_string() }); + + // Let the AppState construct the pre-loaded memory model + let state = AppState::new(&model_path); + + // Build Axum Router + let app = Router::new() + .route("/health", get(|| async { "OK" })) + .route("/predict", post(predict_handler)) + .layer(CorsLayer::permissive()) + .layer(TraceLayer::new_for_http()) + .with_state(state); + + // Run Server + let port = 9050; + let addr = SocketAddr::from(([0, 0, 0, 0], port)); + tracing::info!("Server listening on http://{}", addr); + + let listener = tokio::net::TcpListener::bind(addr).await.unwrap(); + axum::serve(listener, app).await.unwrap(); +} diff --git a/rust_ml/regression_inference/src/model.rs b/rust_ml/regression_inference/src/model.rs new file mode 100644 index 0000000..29ffed0 --- /dev/null +++ b/rust_ml/regression_inference/src/model.rs @@ -0,0 +1,49 @@ +use burn::{ + nn::{Linear, LinearConfig, Relu}, + prelude::*, +}; +use regression::dataset::NUM_FEATURES; + +#[derive(Module, Debug)] +pub struct RegressionModel { + input_layer: Linear, + output_layer: Linear, + activation: Relu, +} + +#[derive(Config, Debug)] +pub struct RegressionModelConfig { + #[config(default = 64)] + pub hidden_size: usize, +} + +impl RegressionModelConfig { + pub fn init( + &self, + device: &B::Device, + ) -> RegressionModel { + let input_layer = LinearConfig::new(NUM_FEATURES, self.hidden_size) + .with_bias(true) + .init(device); + let output_layer = LinearConfig::new(self.hidden_size, 1) + .with_bias(true) + .init(device); + + RegressionModel { + input_layer, + output_layer, + activation: Relu::new(), + } + } +} + +impl RegressionModel { + pub fn forward( + &self, + input: Tensor, + ) -> Tensor { + let x = self.input_layer.forward(input); + let x = self.activation.forward(x); + self.output_layer.forward(x) + } +} diff --git 
a/rust_ml/regression_inference/src/state.rs b/rust_ml/regression_inference/src/state.rs new file mode 100644 index 0000000..c5a8776 --- /dev/null +++ b/rust_ml/regression_inference/src/state.rs @@ -0,0 +1,33 @@ +use burn::{ + backend::Wgpu, + module::Module, + record::{NoStdTrainingRecorder, Recorder}, +}; +use crate::model::{RegressionModel, RegressionModelConfig, RegressionModelRecord}; +use std::sync::{Arc, Mutex}; + +pub type Backend = Wgpu; + +#[derive(Clone)] +pub struct AppState { + pub model: Arc>>, +} + +impl AppState { + pub fn new(model_path: &str) -> Self { + let device = Default::default(); + + // Load model configuration + let record: RegressionModelRecord = NoStdTrainingRecorder::new() + .load(model_path.into(), &device) + .expect("Failed to load model record. Ensure the model is trained."); + + let model = RegressionModelConfig::new() + .init(&device) + .load_record(record); + + Self { + model: Arc::new(Mutex::new(model)), + } + } +}