diff --git a/.continueignore b/.continueignore new file mode 100644 index 00000000..58f22c6e --- /dev/null +++ b/.continueignore @@ -0,0 +1,8 @@ +# .continueignore +**/datasets/** +**/checkpoints/** +**/node_modules/** +*.pt +*.pth +*.png +*.jpg \ No newline at end of file diff --git a/.github/workflows/deploy-staging.yml b/.github/workflows/deploy-staging.yml index d79f6e1f..a7330ef6 100644 --- a/.github/workflows/deploy-staging.yml +++ b/.github/workflows/deploy-staging.yml @@ -3,6 +3,18 @@ name: Deploy Staging Branch on: push: branches: [ staging ] + paths-ignore: + # Ignore changes that don't affect the site + - 'src/assignments/**' + - 'scripts/**' + - 'tests/**' + - 'private/**' + - '**.md' + - '.github/**' + - 'data/**' + - 'pyproject.toml' + - 'pytest.ini' + - 'uv.lock' jobs: deploy: @@ -11,7 +23,7 @@ jobs: - name: Deploy Static Site to Remote Server uses: appleboy/ssh-action@v1.0.3 with: - host: ristoffer.ch + host: direct.ristoffer.ch username: crh key: ${{ secrets.SSH_DEPLOY_KEY }} script: | diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index f3e353a5..da3d9bda 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -3,6 +3,18 @@ name: Deploy to Production on: push: branches: [ main ] + paths-ignore: + # Ignore changes that don't affect the site + - 'src/assignments/**' + - 'scripts/**' + - 'tests/**' + - 'private/**' + - '**.md' + - '.github/**' + - 'data/**' + - 'pyproject.toml' + - 'pytest.ini' + - 'uv.lock' jobs: deploy: @@ -11,7 +23,7 @@ jobs: - name: Deploy to Remote Server uses: appleboy/ssh-action@v1.0.3 with: - host: ristoffer.ch + host: direct.ristoffer.ch username: crh key: ${{ secrets.SSH_DEPLOY_KEY }} script: | diff --git a/.github/workflows/shadow-tester.yml b/.github/workflows/shadow-tester.yml new file mode 100644 index 00000000..1fef1a47 --- /dev/null +++ b/.github/workflows/shadow-tester.yml @@ -0,0 +1,139 @@ +name: Shadow Tester + +on: + repository_dispatch: + types: [run-shadow-tests] 
+ +permissions: + contents: read + pull-requests: write # Needed to comment on PRs + +jobs: + shadow-test: + runs-on: ubuntu-latest + steps: + - name: Checkout Private Repo + uses: actions/checkout@v4 + with: + repository: crheckman/private-vla-foundations + token: ${{ secrets.PRIVATE_REPO_TOKEN }} # PAT with access to private repo + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.10' + + - name: Install Python Dependencies + run: | + pip install pytest torch numpy + + - name: Fetch Student Code from Public PR + env: + PR_NUMBER: ${{ github.event.client_payload.pr_number }} + HEAD_BRANCH: ${{ github.event.client_payload.head_branch }} + HEAD_SHA: ${{ github.event.client_payload.head_sha }} + REPO_URL: ${{ github.event.client_payload.repo_url }} + run: | + echo "Fetching student code from PR #${PR_NUMBER}" + + # Clone the public repo + git clone https://github.com/arpg/vla-foundations.git /tmp/public-repo + cd /tmp/public-repo + + # Fetch the PR branch + git fetch origin pull/${PR_NUMBER}/head:pr-${PR_NUMBER} + git checkout pr-${PR_NUMBER} + + # Copy student code to our testing directory + # Copy src/assignments to the current repo + if [ -d "src/assignments" ]; then + cp -r src/assignments/* $GITHUB_WORKSPACE/src/assignments/ || true + fi + + echo "Student code fetched successfully" + + - name: Run Internal Rigorous Tests + id: tests + continue-on-error: true + run: | + # Run pytest with internal tests + pytest tests/internal/ -v --tb=short --maxfail=5 > test_output.txt 2>&1 + TEST_EXIT_CODE=$? 
+ + # Capture output + cat test_output.txt + + # Save exit code for later + echo "exit_code=${TEST_EXIT_CODE}" >> $GITHUB_OUTPUT + + # Exit with the actual test result + exit $TEST_EXIT_CODE + + - name: Prepare Test Summary + if: always() + id: summary + run: | + if [ -f test_output.txt ]; then + # Extract summary from pytest output + SUMMARY=$(tail -20 test_output.txt | grep -E "(PASSED|FAILED|ERROR)" || echo "Test execution completed") + + # Escape newlines for GitHub output (note: %0A is only decoded by the legacy set-output command, not the $GITHUB_OUTPUT file) + SUMMARY="${SUMMARY//$'\n'/%0A}" + echo "summary=${SUMMARY}" >> $GITHUB_OUTPUT + else + echo "summary=No test output available" >> $GITHUB_OUTPUT + fi + + - name: Comment on Public PR - Pass + if: steps.tests.outcome == 'success' + uses: peter-evans/create-or-update-comment@v3 + with: + token: ${{ secrets.GITHUB_TOKEN }} # FIXME: default GITHUB_TOKEN cannot comment on arpg/vla-foundations (a different repo); use a PAT with access to the public repo + repository: arpg/vla-foundations + issue-number: ${{ github.event.client_payload.pr_number }} + body: | + ## ✅ Shadow CI: Internal Tests Passed + + Your submission passed all internal rigorous tests! + +
+ Test Summary + + ``` + ${{ steps.summary.outputs.summary }} + ``` + +
+ + --- + *These are hidden internal tests run by the instructor. Your code meets the required standards.* + + - name: Comment on Public PR - Fail + if: steps.tests.outcome == 'failure' + uses: peter-evans/create-or-update-comment@v3 + with: + token: ${{ secrets.GITHUB_TOKEN }} # FIXME: default GITHUB_TOKEN cannot comment on arpg/vla-foundations (a different repo); use a PAT with access to the public repo + repository: arpg/vla-foundations + issue-number: ${{ github.event.client_payload.pr_number }} + body: | + ## ❌ Shadow CI: Internal Tests Failed + + Your submission did not pass all internal tests. Please review the feedback and make necessary corrections. + +
+ Test Summary + + ``` + ${{ steps.summary.outputs.summary }} + ``` + +
+ + ### Next Steps: + 1. Review the test failures above + 2. Make corrections to your code + 3. Push updates to your PR branch + 4. Tests will automatically re-run + + --- + *These are hidden internal tests run by the instructor. Contact @crheckman if you need clarification on the failures.* diff --git a/.github/workflows/vla-audit.yml b/.github/workflows/vla-audit.yml index b8237a36..afa5a0d5 100644 --- a/.github/workflows/vla-audit.yml +++ b/.github/workflows/vla-audit.yml @@ -42,7 +42,15 @@ jobs: ### Common Issues: - **1. Semantic Line Breaks** + **1. Required Frontmatter Fields** + - Every audit MDX file must include these fields: + - `title`: Paper title + - `author`: Paper author(s) + - `topic`: Research topic/category + - `paper`: Link to paper or citation + - All fields must have non-empty values (no placeholders like "TBD" or "TODO") + + **2. Semantic Line Breaks** - Each sentence should be on its own line - This makes PR commenting and reviewing much easier - Example: @@ -53,7 +61,7 @@ jobs: + This makes PR review much easier. ``` - **2. Clean Git History** + **3. 
Clean Git History** - No "Merge branch 'main'" commits allowed - Use `git rebase main` instead of `git merge main` - Keep your commit history linear and clean @@ -144,22 +152,3 @@ jobs: --- *This preview will be removed when the PR is closed.* - - trigger-shadow-tests: - runs-on: ubuntu-latest - if: github.event_name == 'pull_request' && github.base_ref == 'staging' - needs: audit - steps: - - name: Trigger Shadow CI in Private Repo - uses: peter-evans/repository-dispatch@v2 - with: - token: ${{ secrets.PRIVATE_DISPATCH_TOKEN }} - repository: crheckman/private-vla-foundations - event-type: run-shadow-tests - client-payload: | - { - "pr_number": "${{ github.event.pull_request.number }}", - "head_branch": "${{ github.event.pull_request.head.ref }}", - "head_sha": "${{ github.event.pull_request.head.sha }}", - "repo_url": "${{ github.event.pull_request.head.repo.clone_url }}" - } diff --git a/.gitignore b/.gitignore index 3519214c..b0ff8ff2 100644 --- a/.gitignore +++ b/.gitignore @@ -65,3 +65,6 @@ pytest.ini PRIVATE_REPO_SETUP.md SETUP_WITH_GH_CLI.md QUICK_REFERENCE.md + +# Ignore generated data +*.pkl diff --git a/README.md b/README.md index 323ea9e7..f83a27ee 100644 --- a/README.md +++ b/README.md @@ -83,103 +83,6 @@ git push --force-with-lease --- -## Repository Structure - -``` -vla-foundations/ -├── app/ # Next.js App Router (web framework) -│ ├── page.tsx # Landing page -│ ├── textbook/[slug]/ # Dynamic chapter pages -│ ├── course/ # Course overview page -│ │ └── assignments/[slug]/ # Dynamic assignment pages -│ └── contributors/[slug]/ # Dynamic contributor profile pages -│ -├── content/ # All MDX content (rendered as web pages) -│ ├── textbook/ # 8-chapter VLA textbook -│ │ ├── foundations/ # Chapter 0: Core concepts -│ │ ├── architectures/ # Chapter 1: Model designs -│ │ ├── data/ # Chapter 2: Dataset construction -│ │ ├── training/ # Chapter 3: Optimization methods -│ │ ├── evaluation/ # Chapter 4: Metrics and benchmarks -│ │ ├── deployment/ # Chapter 5: 
Production systems -│ │ ├── applications/ # Chapter 6: Real-world use cases -│ │ └── future/ # Chapter 7: Open problems -│ │ -│ ├── course/ # Course materials -│ │ ├── Syllabus.mdx # Course syllabus -│ │ ├── assignments/ # Assignment specifications -│ │ └── submissions/ # Student submission reports -│ │ -│ └── contributors/ # Contributor profiles -│ └── [github-handle].mdx # One profile per contributor -│ -└── src/ # Executable source code - └── assignments/ # Assignment code templates - └── scratch-1/ # Example: Transformer implementation - ├── README.md # Minimal README - ├── backbone.py # Implementation template with TODOs - └── generate_data.py # Dataset generator script -``` - ---- - -## The 8-Chapter Textbook - -0. **Foundations** - Core concepts and problem formulation -1. **Architectures** - Model designs and network topologies -2. **Data** - Dataset construction and curation strategies -3. **Training** - Optimization and fine-tuning methods -4. **Evaluation** - Metrics and benchmarking protocols -5. **Deployment** - Production systems and scaling -6. **Applications** - Real-world use cases and case studies -7. **Future Directions** - Open problems and research frontiers - ---- - -## Development Workflow - -### Initial Setup - -```bash -# Clone the repository -git clone https://github.com/arpg/vla-foundations.git -cd vla-foundations - -# Install dependencies -pnpm install - -# Run development server -pnpm dev -``` - -Navigate to `http://localhost:3000` to see the site. 
- -### Local Build - -```bash -# Build the static site -pnpm build - -# Preview the production build -pnpm start -``` - ---- - -## Technologies - -### Core -- **Next.js 16**: Static site generation -- **TypeScript**: Type safety -- **Tailwind CSS**: Styling -- **MDX**: Markdown with JSX - -### Content Processing -- **remark-math** + **rehype-katex**: LaTeX rendering -- **remark-gfm**: GitHub-flavored Markdown - ---- - ## Resources ### Documentation diff --git a/app/globals.css b/app/globals.css index 66b4c4fa..6222d3d8 100644 --- a/app/globals.css +++ b/app/globals.css @@ -21,3 +21,105 @@ body { color: var(--foreground); font-family: var(--font-geist-sans), Arial, Helvetica, sans-serif; } + +/* KaTeX display mode - ensure proper spacing and centering */ +.katex-display { + overflow-x: auto; + overflow-y: hidden; + padding: 1.5rem 0; + margin: 1.5rem 0; + text-align: center; +} + +/* KaTeX inline mode */ +.katex { + font-size: 1.05em; +} + +/* Prevent double-rendering by hiding any raw LaTeX that might leak through */ +.katex-html { + /* KaTeX should render, not raw LaTeX */ +} + +/* Better code block styling for audit pages */ +.prose pre { + @apply bg-slate-900 text-slate-100 rounded-lg; + padding: 1.5rem; + overflow-x: auto; + border: 1px solid #334155; +} + +.prose code { + @apply bg-slate-100 text-slate-900 px-1.5 py-0.5 rounded; + font-size: 0.9em; +} + +.prose pre code { + @apply bg-transparent text-slate-100 p-0; +} + +/* Better heading hierarchy */ +.prose h1 { + @apply text-4xl font-bold text-slate-900 mb-6 mt-12 pb-3 border-b-2 border-slate-200; +} + +.prose h2 { + @apply text-3xl font-bold text-slate-800 mb-4 mt-10; +} + +.prose h3 { + @apply text-2xl font-semibold text-slate-800 mb-3 mt-8; +} + +.prose h4 { + @apply text-xl font-semibold text-slate-700 mb-2 mt-6; +} + +/* Better list styling */ +.prose ul { + @apply list-disc pl-6 space-y-2 my-4; +} + +.prose ol { + @apply list-decimal pl-6 space-y-2 my-4; +} + +.prose li { + @apply text-slate-700 
leading-relaxed; +} + +/* Better blockquote styling */ +.prose blockquote { + @apply border-l-4 border-blue-500 pl-6 italic text-slate-600 my-6; + background: linear-gradient(to right, rgba(59, 130, 246, 0.05), transparent); + padding: 1rem 1.5rem; + border-radius: 0 0.5rem 0.5rem 0; +} + +/* Table styling */ +.prose table { + @apply w-full border-collapse my-6; +} + +.prose th { + @apply bg-slate-100 font-semibold text-left px-4 py-3 border border-slate-300; +} + +.prose td { + @apply px-4 py-3 border border-slate-300; +} + +/* Links */ +.prose a { + @apply text-blue-600 hover:text-blue-800 underline decoration-blue-300 hover:decoration-blue-500 transition-colors; +} + +/* Image captions and figures */ +.prose img { + @apply rounded-lg shadow-md my-6; +} + +/* Horizontal rules */ +.prose hr { + @apply border-slate-300 my-12; +} diff --git a/app/layout.tsx b/app/layout.tsx index e5072852..97140f9f 100644 --- a/app/layout.tsx +++ b/app/layout.tsx @@ -1,7 +1,7 @@ import type { Metadata } from "next"; import { Geist, Geist_Mono } from "next/font/google"; +import "katex/dist/katex.min.css"; // Must come before globals.css to allow overrides import "./globals.css"; -import "katex/dist/katex.min.css"; const geistSans = Geist({ variable: "--font-geist-sans", diff --git a/app/textbook/audits/[...slug]/page.tsx b/app/textbook/audits/[...slug]/page.tsx index ceef3934..698839dc 100644 --- a/app/textbook/audits/[...slug]/page.tsx +++ b/app/textbook/audits/[...slug]/page.tsx @@ -9,6 +9,7 @@ import remarkGfm from "remark-gfm"; import matter from "gray-matter"; import { AuditLayout } from "@/components/audit/AuditLayout"; import { getAllChapters } from "@/lib/chapters"; +import { KatexStyles } from "@/components/KatexStyles"; interface PageProps { params: Promise<{ slug: string[] }>; @@ -89,48 +90,65 @@ export default async function AuditPage({ params }: PageProps) { // Get chapters for sidebar const chapters = getAllChapters(); + // Determine if we're in review mode based on 
environment + const isReviewMode = isStaging && process.env.STAGING_PR_NUMBER !== undefined; + const prNumber = process.env.STAGING_PR_NUMBER; + return ( - + + ← Back to Audits - {/* Staging Banner */} - {isStaging && ( -
-

⚠️ DRAFT AUDIT - UNDER REVIEW

-

+ {/* Staging Banner (shown when in staging but not in review mode) */} + {isStaging && !isReviewMode && ( +

+

+ + + + DRAFT AUDIT - UNDER REVIEW +

+

This is a preview of a student audit currently under review. Content may change before final publication.

)} {/* Audit Header */} -
-
+
+
{data.topic && ( - + {data.topic} )} {isStaging && ( - + DRAFT )}
-

+

{data.title || "Paper Audit"}

{data.paper && ( -

{data.paper}

+

{data.paper}

)} {data.author && ( -

- By {data.author} +

+ + + + By {data.author}

)}
@@ -141,7 +159,13 @@ export default async function AuditPage({ params }: PageProps) { options={{ mdxOptions: { remarkPlugins: [remarkMath, remarkGfm], - rehypePlugins: [rehypeKatex], + rehypePlugins: [ + [rehypeKatex, { + strict: false, // Don't fail on unknown LaTeX commands + trust: true, // Allow some advanced LaTeX features + throwOnError: false, // Gracefully handle errors + }] + ], }, }} /> diff --git a/claude.md b/claude.md new file mode 100644 index 00000000..6837d90b --- /dev/null +++ b/claude.md @@ -0,0 +1,644 @@ +# VLA Foundations Development Guide for AI SWE Agents (Private Repo) + +This is the **private instructor repository** for VLA Foundations, containing complete assignment solutions, internal grading tests, and instructor operations. The public student-facing repository is at `arpg/vla-foundations`. This repo uses **Next.js (App Router)** for the textbook, **Tailwind CSS** for styling, **MDX** for content, and **pnpm** for package management. + +Read more about the dual-repository architecture in [INSTRUCTOR.md](INSTRUCTOR.md). 
+ +--- + +## Repository Architecture + +This is a **two-repository system**: + +``` +Private Repo (crheckman/private-vla-foundations) +├── private/ # Complete assignment solutions (NEVER PUBLIC) +│ └── solutions/ +├── tests/internal/ # Internal grading tests (NEVER PUBLIC) +│ ├── fixtures/ # Gold standard test data +│ └── reports/ # Grading reports (git-ignored) +├── scripts/ +│ ├── dev_utils.py # Solution management (inject/reset/verify-clean) +│ ├── sanitize.sh # Automated sanitization pipeline +│ └── _sanitize_todos.py # TODO comment sanitizer +├── .claude/ +│ ├── skills/ # Claude Code skills for automation +│ └── commands/ # Slash command shortcuts +└── src/assignments/ # Starter code with [SOLUTION] hints + + ↓ (Orphan push on release tag) + +Public Repo (arpg/vla-foundations) +├── src/assignments/ # Starter code (TODOs only) +├── tests/public/ # Student-visible tests +├── content/ # Textbook and assignment specs +└── [NO private/ or tests/internal/] +``` + +**Critical**: Never commit `private/` or `tests/internal/` to public branches. + +--- + +## Initial Setup + +### Prerequisites +```bash +# Install dependencies +pnpm install + +# Install uv (Python package manager) - REQUIRED +curl -LsSf https://astral.sh/uv/install.sh | sh + +# Install Python dependencies via uv +uv sync + +# Install GitHub CLI (required for skills) +brew install gh +gh auth login +``` + +### Python Environment (uv) +**All Python commands MUST use `uv run`** to ensure correct dependencies: +```bash +# Run Python scripts +uv run python scripts/dev_utils.py --list + +# Run pytest +uv run pytest tests/internal/ -v -m rigor + +# Run any Python file +uv run python src/assignments/scratch-1/generate_data.py +``` + +### Development +```bash +# Run development server +pnpm dev + +# Build production (static export in out/) +pnpm build + +# Lint Next.js +pnpm lint +``` + +--- + +## Claude Code Skills (Automation) + +This repository has **7 Claude Code skills** for workflow automation. 
See [.claude/skills/README.md](.claude/skills/README.md) for complete documentation. + +### Core Skills + +#### `/vla-guard` - Solution Leak Audit +**Purpose**: Prevent solution leaks before any public operation + +**Usage**: +```bash +/vla-guard +``` + +**What it does**: +- Scans for `[SOLUTION]` markers in `src/` and `content/` +- Verifies `private/` and `tests/internal/` not staged +- Checks git history for accidental commits +- Runs `dev_utils.py --verify-clean` (similarity detection) +- **Blocks** sync if any check fails + +**When to use**: Before every push, PR, or release + +--- + +#### `/test-rigor` - Internal Grading Tests +**Purpose**: Run internal grading tests with automatic solution injection/reset + +**Usage**: +```bash +/test-rigor +# Select: "Scratch-1" / "Scratch-2" / "All" +``` + +**What it does**: +1. Injects solutions: `python3 scripts/dev_utils.py --inject ` +2. Runs pytest: `pytest tests/internal/ -v -m rigor` +3. Generates report: `tests/internal/reports/test-report-.txt` +4. Resets to starter code: `python3 scripts/dev_utils.py --reset ` + +**Safe to run multiple times** - always resets after completion. + +--- + +#### `/generate-fixtures` - Gold Standard Fixtures +**Purpose**: Generate reference data for fidelity tests from solution code + +**Usage**: +```bash +/generate-fixtures +# Select assignment +``` + +**What it does**: +1. Injects solutions +2. Sets fixed random seeds (seed=42) +3. Runs solution code to generate outputs +4. Saves to `tests/internal/fixtures//gold_output.pt` +5. Verifies no NaNs +6. Generates fixture documentation +7. Resets to starter code + +**When to use**: After completing solution implementation or updating solution code + +--- + +#### `/grade` - Automated PR Grading +**Purpose**: Complete grading workflow for student pull requests + +**Usage**: +```bash +/grade +# Enter PR number or auto-detect latest +``` + +**What it does**: +1. Fetches student code from GitHub PR +2. 
Runs VLA Guard on student code (detect plagiarism/leaks) +3. Runs `tests/public/` (student-visible tests) +4. Injects reference solution +5. Runs `tests/internal/` (gradient leak, fidelity, training tests) +6. Restores student code +7. Generates detailed markdown feedback report +8. Posts comment on PR (optional) +9. Updates PR labels (ready-to-merge / needs-revision / changes-requested) + +**Output**: `tests/internal/reports/grade-pr.md` + +**When to use**: When reviewing student submissions + +--- + +#### `/release` - Safe Assignment Publishing +**Purpose**: Orchestrate complete release workflow with comprehensive safety checks + +**Usage**: +```bash +/release +# Select: "Scratch-1" / "Scratch-2" / etc. +``` + +**What it does**: +1. Verifies on main branch, no uncommitted changes +2. Runs `/vla-guard` pre-flight audit (fail-fast) +3. Prompts for release tag (e.g., `release-scratch-2`) +4. Shows changes since last release +5. Runs `scripts/sanitize.sh` (removes private/, [SOLUTION] markers, etc.) +6. Verifies sanitization (fail-safe) +7. Creates annotated git tag +8. Pushes tag → triggers `.github/workflows/sync-to-public.yml` +9. Monitors GitHub Actions workflow execution +10. Verifies public repository (no leaks) +11. Checks deployment status (https://www.vlm-robotics.dev) +12. Generates release summary: `.claude/releases/release-.md` + +**Fail-safe**: Aborts at ANY failed check, provides remediation instructions + +**When to use**: When ready to publish assignment to students + +--- + +#### `/new-assignment` - Assignment Scaffolding +**Purpose**: Create complete assignment structure with templates + +**Usage**: +```bash +/new-assignment +# Enter name, type, focus, difficulty +``` + +**What it does**: +1. 
Creates directory structure: + - `src/assignments//` (starter code with TODOs) + - `private/solutions//` (solution templates) + - `tests/public/test__basic.py` (student-visible tests) + - `tests/internal/test__rigor.py` (grading tests) + - `content/course/assignments/.mdx` (assignment spec) +2. Generates Python templates +3. Generates test templates +4. Creates README files + +**Next steps after scaffolding**: +1. Complete solution implementations +2. Run `/generate-fixtures` +3. Update MDX spec +4. Run `/test-rigor` +5. Commit changes +6. Run `/release` + +--- + +#### `/sync-check` - Post-Release Verification +**Purpose**: Verify public repository has no leaks after release sync + +**Usage**: +```bash +/sync-check +# Select: "Latest" or specify release tag +``` + +**What it does**: +1. Clones public repo to temp directory (read-only) +2. Scans for `[SOLUTION]` markers +3. Checks for private directories (`private/`, `tests/internal/`) +4. Checks for private scripts (`dev_utils.py`, `sanitize.sh`) +5. Checks for sensitive files (credentials, `*_solution.py`) +6. Verifies orphan push strategy (no linked git history) +7. Compares file lists (private vs public) +8. Checks deployment status (HTTPS 200) +9. Runs sample fidelity check +10. Generates verification report: `.claude/sync-reports/sync-check-.md` +11. 
Cleans up temp files + +**When to use**: Always run after `/release` completes + +**Critical**: If leaks detected, report provides urgent remediation steps + +--- + +## Commands Useful in Development + +### Solution Management +```bash +# List all available solutions +uv run python scripts/dev_utils.py --list + +# Inject solutions for testing/grading +uv run python scripts/dev_utils.py --inject scratch-1 + +# Reset to starter code +uv run python scripts/dev_utils.py --reset scratch-1 + +# Verify no solution leaks (similarity check) +uv run python scripts/dev_utils.py --verify-clean +``` + +### Testing +```bash +# Run public tests (students can see these) +uv run pytest tests/public/ -v + +# Run internal grading tests (after injecting solutions) +uv run pytest tests/internal/ -v -m rigor + +# Run specific test file +uv run pytest tests/internal/test_scratch1_rigor.py -v + +# Generate HTML report +uv run pytest tests/internal/ --html=tests/internal/reports/report.html --self-contained-html +``` + +### Pre-Release Checks +```bash +# Complete pre-flight check +/pre-flight + +# Or manually: +uv run python scripts/dev_utils.py --verify-clean +bash scripts/sanitize.sh # (Only in orphan branch workflow) +``` + +### GitHub Operations +```bash +# List open student PRs +gh pr list --base staging --state open + +# View PR details +gh pr view 123 + +# Comment on PR +gh pr comment 123 --body "Feedback here" + +# Merge PR +gh pr merge 123 --squash +``` + +--- + +## Linting and Formatting + +### Semantic Line Breaks +**All MDX files MUST use one sentence per line.** This is mandatory to allow granular, line-by-line feedback in Pull Requests. + +**Bad:** +```markdown +This is a very long sentence with multiple ideas. It continues on the same line. This makes PR review difficult. +``` + +**Good:** +```markdown +This is a sentence on its own line. +Each idea gets its own line. +This makes PR review much easier. 
+``` + +### LaTeX +Use formal LaTeX for all mathematical derivations: +```markdown +The loss function is: +$$ +\mathcal{L} = -\sum_{t=1}^T \log p(a_t | s_t, I_t) +$$ +``` + +Do not use code blocks for math. + +### Next.js Linting +```bash +pnpm lint +``` + +--- + +## Testing Philosophy + +### Public Tests (`tests/public/`) +**Purpose**: Student-visible validation tests + +**What they test**: +- Basic model structure (initialization, shapes) +- Forward pass correctness (no NaNs, correct dimensions) +- Gradient flow (backpropagation works) + +**Students can run these**: `pytest tests/public/test_scratch1_basic.py -v` + +### Internal Tests (`tests/internal/`) +**Purpose**: Rigorous grading tests (NEVER synced to public) + +**What they test**: +- **Gradient Leak Test**: Verify frozen parameters (e.g., DINOv2 backbone) +- **Latent Fidelity Test**: Compare output against gold standard fixtures +- **Training Convergence Test**: Verify model can train and loss decreases +- **Edge Case Tests**: Boundary conditions, error handling + +**Markers**: +- `@pytest.mark.internal` - All internal tests +- `@pytest.mark.rigor` - Strict grading tests +- `@pytest.mark.gradient` - Gradient flow tests +- `@pytest.mark.fidelity` - Output comparison tests +- `@pytest.mark.training` - Training convergence tests + +**Run with**: `pytest tests/internal/ -v -m rigor` + +--- + +## Interacting with the App + +### Local Development +```bash +pnpm dev +# Access at http://localhost:3000 +``` + +### Staging Previews +Every Pull Request to `staging` branch triggers deployment to: +``` +https://vlm-robotics.dev/staging/pulls/[PR_NUMBER]/ +``` + +**Review Protocol**: +1. Read the rendered audit on the staging site +2. Comment on the **source MDX** in GitHub "Files Changed" tab +3. 
Use the **Rich Diff** view in GitHub to verify LaTeX rendering + +### Production +Production site deployed at: +``` +https://www.vlm-robotics.dev +``` + +Deployment triggered by: +- Push to `main` branch (after staging → main merge) +- GitHub Action: `.github/workflows/deploy.yml` +- Deploys to ristoffer.ch via SSH + +--- + +## Patterns & Standards + +### Amazon Principle +We do not write "summaries." We write rigorous, durable **Audits**. A high-fidelity audit IS the textbook chapter. + +### Textbook Audit Sidebars +Every audit MUST contain these three technical sidebars: + +1. **The Lineage of Failure**: Why previous approaches died +2. **Intuitive Derivation**: The geometric/mathematical intuition of the loss function +3. **Implementation Gotchas**: Practitioners' notes on coordinate frames, normalization, or hyperparameters + +### The Interface Focus +When auditing VLA models, focus on the **Interface**: +- **Input Projection**: Pixels → Tokens +- **Action Head**: Tokens → Trajectories +- **The Loss/Objective Function** + +### Git Hygiene +We are a **rebase-only** lab. Use `git rebase main`. PRs containing "Merge branch 'main'" commits will be closed. + +**Correct workflow**: +```bash +git fetch origin +git rebase origin/main +git push --force-with-lease +``` + +### Sanitization +All private solutions are marked with `[SOLUTION]` tags: +```python +# TODO: Implement RMSNorm forward pass +# [SOLUTION] Use torch.rsqrt for efficiency +result = torch.rsqrt(variance + self.eps) +``` + +The sanitization pipeline: +1. `scripts/_sanitize_todos.py` - Removes `[SOLUTION]` markers +2. `scripts/sanitize.sh` - Orchestrates full cleanup (private dirs, scripts, README) +3. Triggered automatically by `.github/workflows/sync-to-public.yml` on release tags + +**Load-bearing wall**: `scripts/sanitize.sh` is the primary defense against solution leaks. 
+ +### Orphan Push Strategy +When syncing to public repo, we use **orphan branches** to break all git history links: + +```bash +git checkout --orphan temp-public-branch +git add -A +git commit -m "Public Release: $(date)" +git push public temp-public-branch:main --force +``` + +**Benefits**: +- No commit history from private repo exposed +- Public repo has completely independent git history +- Maximum security against accidental leaks via `git log` + +--- + +## File Map of Interest + +### GitHub Actions +- [.github/workflows/sync-to-public.yml](.github/workflows/sync-to-public.yml) - Automated sync to public repo (orphan push) +- [.github/workflows/shadow-tester.yml](.github/workflows/shadow-tester.yml) - Shadow CI for student PRs +- [.github/workflows/deploy.yml](.github/workflows/deploy.yml) - Production deployment to ristoffer.ch + +### Configuration +- [next.config.ts](next.config.ts) - Next.js config with dynamic routing for staging +- [pytest.ini](pytest.ini) - pytest markers configuration +- [tailwind.config.ts](tailwind.config.ts) - Tailwind CSS configuration + +### Scripts +- [scripts/dev_utils.py](scripts/dev_utils.py) - Solution management (inject/reset/verify-clean) +- [scripts/sanitize.sh](scripts/sanitize.sh) - Complete sanitization pipeline +- [scripts/_sanitize_todos.py](scripts/_sanitize_todos.py) - TODO comment sanitizer + +### Claude Code Skills +- [.claude/skills/](/.claude/skills/) - All skill definitions +- [.claude/skills/README.md](.claude/skills/README.md) - Comprehensive skills documentation +- [.claude/commands/](.claude/commands/) - Command shortcuts + +### Components +- [components/audit/AuditLayout.tsx](components/audit/AuditLayout.tsx) - Primary wrapper for rendered textbook chapters + +### Testing +- [tests/conftest.py](tests/conftest.py) - pytest fixtures (auto-inject for internal tests) +- [tests/public/](tests/public/) - Student-visible tests +- [tests/internal/](tests/internal/) - Internal grading tests + +### Documentation +- 
[INSTRUCTOR.md](INSTRUCTOR.md) - Complete instructor guide (consolidated) +- [SKILLS_COMPLETE.md](SKILLS_COMPLETE.md) - Skills implementation summary +- [REFACTOR_COMPLETE.md](REFACTOR_COMPLETE.md) - Repository hardening summary + +--- + +## Typical Workflows + +### Creating a New Assignment +```bash +# 1. Scaffold structure +/new-assignment + +# 2. Implement solutions +# Edit: private/solutions/scratch-3/model_solution.py + +# 3. Generate fixtures +/generate-fixtures + +# 4. Update spec +# Edit: content/course/assignments/scratch-3.mdx + +# 5. Test grading +/test-rigor + +# 6. Commit +git add . && git commit -m "feat: add scratch-3 assignment" + +# 7. Release +/release + +# 8. Verify +/sync-check +``` + +### Grading Student Work +```bash +# 1. List PRs +gh pr list --base staging + +# 2. Grade PR +/grade + +# 3. Review report +cat tests/internal/reports/grade-pr123.md + +# 4. Merge if approved +gh pr merge 123 --squash +``` + +### Pre-Release Checklist +```bash +# 1. Audit +/vla-guard + +# 2. Pre-flight (audit + sanitize) +/pre-flight + +# 3. Release +/release + +# 4. Verify +/sync-check +``` + +--- + +## Shadow CI + +Student PRs to the public repo trigger **Shadow CI** - hidden testing with internal grading suite: + +1. Student opens PR to `arpg/vla-foundations` (public) +2. Public `.github/workflows/vla-audit.yml` triggers `repository_dispatch` to private repo +3. Private `.github/workflows/shadow-tester.yml` runs: + - Fetches student code + - Injects solutions + - Runs internal tests + - Posts Pass/Fail comment on public PR (no details) +4. Instructor uses `/grade` for detailed feedback + +**Purpose**: Catch critical failures early without exposing grading logic. 
+ +--- + +## Security Boundaries + +### NEVER Sync to Public +- `private/` directory (complete solutions) +- `tests/internal/` directory (grading tests) +- `scripts/dev_utils.py` (solution management) +- `scripts/sanitize.sh` (sanitization script) +- `scripts/_sanitize_todos.py` (helper script) +- `.claude/` directory (instructor automation) +- Files with `[SOLUTION]` markers + +### Multi-Layer Protection +1. **Pre-commit hook** - Blocks commits with `[SOLUTION]` in public files +2. **VLA Guard skill** - Scans for leaks before operations +3. **Sanitization pipeline** - Removes private content automatically +4. **Post-sanitization validation** - Fail-safe check in GitHub Actions +5. **Orphan push** - Breaks git history links +6. **Sync-check skill** - Verifies public repo after release + +--- + +## Requirements + +- **Node.js** 18+ +- **pnpm** 8+ +- **Python** 3.11+ +- **uv** (Python package manager): `curl -LsSf https://astral.sh/uv/install.sh | sh` +- **gh CLI** (for skills): `brew install gh && gh auth login` + +Python dependencies (managed by uv via `pyproject.toml`): +- pytest, pytest-html +- torch +- numpy + +--- + +## Support + +- **Instructor Guide**: [INSTRUCTOR.md](INSTRUCTOR.md) +- **Skills Documentation**: [.claude/skills/README.md](.claude/skills/README.md) +- **Public Repo**: https://github.com/arpg/vla-foundations +- **Course Website**: https://www.vlm-robotics.dev + +--- + +**Remember**: This is the private instructor repository. Always run `/vla-guard` before any public-facing operation. 
diff --git a/components/KatexStyles.tsx b/components/KatexStyles.tsx new file mode 100644 index 00000000..ed641534 --- /dev/null +++ b/components/KatexStyles.tsx @@ -0,0 +1,22 @@ +'use client'; + +import { useEffect } from 'react'; + +export function KatexStyles() { + useEffect(() => { + // Ensure KaTeX CSS is loaded + const link = document.createElement('link'); + link.rel = 'stylesheet'; + link.href = 'https://cdn.jsdelivr.net/npm/katex@0.16.27/dist/katex.min.css'; + link.integrity = 'sha384-yp+jpRNKIa0xGrYaVtwImDXkFq7ZOCV5kJZVDg/uAFfYPmtFcKr0sxhVJy1HqnWD'; + link.crossOrigin = 'anonymous'; + + // Check if already loaded + const existing = document.querySelector('link[href*="katex"]'); + if (!existing) { + document.head.appendChild(link); + } + }, []); + + return null; +} diff --git a/components/audit/AuditLayout.tsx b/components/audit/AuditLayout.tsx index a44c8163..71234d86 100644 --- a/components/audit/AuditLayout.tsx +++ b/components/audit/AuditLayout.tsx @@ -13,26 +13,55 @@ interface Chapter { interface AuditLayoutProps { children: ReactNode; chapters: Chapter[]; + isReviewMode?: boolean; + prNumber?: string; } -export function AuditLayout({ children, chapters }: AuditLayoutProps) { +export function AuditLayout({ children, chapters, isReviewMode = false, prNumber }: AuditLayoutProps) { return ( -
+
-
+
+ {/* Review Mode Banner */} + {isReviewMode && ( +
+
+
+ + + + +
+
+

+ 👁️ REVIEW MODE +

+

+ You are viewing a preview of this audit. This content is under review and not yet published. +

+ {prNumber && ( +

+ Preview from PR #{prNumber} +

+ )} +
+
+
+ )} +
{children}
-
diff --git a/content/course/assignments/capstone.mdx b/content/course/assignments/capstone.mdx index 43b7f82c..a267bc61 100644 --- a/content/course/assignments/capstone.mdx +++ b/content/course/assignments/capstone.mdx @@ -1,377 +1,95 @@ --- -title: 'Capstone Project: Textbook Contribution & Implementation' +title: 'The VLA Capstone: Engineering the Frontier' assignment: 3 -due: 'Week 16' -points: 300 +due: 'Finals Week' +points: 250 --- -
-

⚠️ DRAFT: NOT YET ASSIGNED

-

This assignment is still under review and subject to change. Do not begin work until this notice is removed.

-
+# The VLA Capstone: From Audit to Architecture -# Capstone Project: Textbook Contribution & Implementation +**Weight:** 25% of Final Grade +**Initial Project Specification Due:** Leading into the Architecture Lab (This Thursday). +**Mastery Deadline:** Finals Week. -## Objective +## The Philosophy: Audit, Implement, Extend -Make a substantive contribution to the VLA Foundations textbook by authoring technical content, implementing code, and presenting your work to the class. - -## Learning Goals - -- **Synthesize** knowledge from multiple research papers -- **Implement** a non-trivial VLA component or experiment -- **Communicate** technical concepts clearly in writing -- **Present** findings to a technical audience +In this course, we do not perform "re-implementations" for the sake of practice. The Capstone is a substantive contribution to the `vlm-robotics.dev` living textbook. You are expected to move from an auditor (Assignment 2) to an architect—identifying a bottleneck, proposing a delta, and proving it via implementation. ## Project Tracks -Choose **one** of the following tracks: - -### Track 1: Research Extension - -Extend an existing VLA paper with novel experiments or analysis. - -**Requirements:** -- Reproduce baseline results from a published paper -- Design and run new experiments that test an unexplored dimension -- Contribute a textbook section analyzing your findings - -**Example Projects:** -- "Does RT-2 generalize to novel object geometries?" - Test on CAD-generated objects -- "Scaling laws for VLA data diversity" - Ablate dataset composition -- "Failure modes of diffusion policies in cluttered scenes" - Systematic failure analysis +Choose **one** of the following tracks for your technical deep-dive: -### Track 2: Engineering Implementation +### Track 1: Research Extension (The "Delta" Track) +Extend an existing VLA paper with novel experiments. 
+- **Requirement:** Reproduce a baseline, then design experiments testing a specific "Initial Dissolve" (e.g., "Does RT-2 generalize to novel object geometries created in simulation?"). +- **Textbook Contribution:** A section analyzing your findings and the "Information Decay" observed. +### Track 2: Engineering Implementation (The "Systems" Track) Build a production-grade VLA component from scratch. +- **Requirement:** Implement a key technique (e.g., an optimized vision encoder for 50Hz control, a cross-embodiment training harness). +- **Textbook Contribution:** A technical "Implementation Gotchas" guide and practitioners' manual for your component. -**Requirements:** -- Implement a key VLA technique (encoder, policy, training pipeline) -- Write clean, documented, tested code -- Contribute a textbook section with implementation details - -**Example Projects:** -- "Efficient vision encoder for real-time robotic control" - Optimized transformer -- "Multi-task policy training framework" - PyTorch training harness -- "Sim-to-real transfer toolkit" - Domain randomization + evaluation suite - -### Track 3: Comprehensive Survey - -Write an authoritative survey of a VLA subtopic. - -**Requirements:** -- Read 15-20 papers in a focused area -- Identify trends, gaps, and open questions -- Contribute a textbook section synthesizing the literature - -**Example Projects:** -- "Data augmentation strategies for robotic learning" - Survey + taxonomy -- "Benchmarking protocols for manipulation tasks" - Analysis of evaluation methods -- "Foundation models for embodied AI: A critical review" - Strengths/weaknesses analysis - -## Deliverables - -### 1. Proposal (Week 8) - -**Submit**: 1-2 page proposal via pull request - -**Contents:** -- Track selection (Research/Engineering/Survey) -- Problem statement and motivation -- Planned approach and timeline -- Expected contribution to textbook - -**Grading**: Pass/Fail (instructor feedback provided) - -### 2. 
Textbook Chapter Contribution (Week 16) - -**Submit**: MDX file with written content - -**Requirements:** -- 2000-4000 words of technical writing -- LaTeX equations for mathematical formulations -- Code snippets (if applicable) -- References to relevant papers -- Fits cohesively into one of the 8 textbook chapters - -**Location**: `content/textbook/[chapter-name]/your-section.mdx` - -**Example Structure:** +### Track 3: Synthesis & Taxonomy (The "Survey" Track) +Write an authoritative survey of a VLA sub-domain. +- **Requirement:** Read 15-20 papers. Identify the "Lineage of Failure" and the scaling laws of the sub-topic. +- **Textbook Contribution:** A foundational chapter synthesizing the literature into a cohesive taxonomy. -```mdx --- -title: "3.5 Your Section Title" -chapter: 3 -subsection: 5 -author: "Your Name" ---- - -# 3.5 Your Section Title - -## Motivation - -Why does this topic matter? - -## Background - -What do readers need to know? - -## Method - -How does it work? (Include equations) - -## Results - -What did you find? (Include figures/tables) - -## Discussion - -What are the implications? - -## References - -[Numbered references] -``` - -### 3. Code Implementation (Weeks 12-16) - -**Required for Research & Engineering tracks** (optional for Survey track) - -**Submit**: Pull request with code - -**Requirements:** -- Clean, documented Python code -- README with setup instructions -- Example usage / demo script -- Unit tests (if applicable) - -**Location**: `code/capstone/your-project-name/` - -**Grading Criteria:** -- Code quality and organization (30%) -- Documentation and comments (30%) -- Functionality and correctness (40%) -### 4. Final Presentation (Week 16) +## Technical Requirements -**Format**: 15-minute presentation + 5-minute Q&A +### 1. The Architectural Delta +Your project must identify a specific bottleneck in a "Primary Paper." You are not parrots; you are auditors. 
If you choose Track 1 or 2, you must justify your architectural changes using the **Amazon Principle**: write a technical specification that proves why this change is necessary. -**Contents:** -1. Problem statement and motivation (2 min) -2. Approach and methodology (5 min) -3. Results and findings (5 min) -4. Textbook contribution overview (2 min) -5. Lessons learned and future work (1 min) +### 2. The Data Mix +You must explicitly define your data curation strategy: +- **Foundational Priors:** Which internet-scale weights (SigLIP, DINOv2) are you using? +- **Embodied Data:** Which subset of Open X-Embodiment or DROID are you sampling? +- **Synthetic Multiplication:** Are you using *MimicGen* or *RoboGen* to scale your seeds? -**Slides**: Submit PDF via pull request +### 3. Formalized Logic & Derivations +Your documentation must be grounded in $\LaTeX$. +- Derive your specific loss function $\mathcal{L}_{total}$. +- Define the state-space $S$ and the action-space $A$ (e.g., Delta-EE, Joint Velocities, or Latent Tokens). -## Timeline +### 4. Semantic Form +All MDX contributions must follow the **Semantic Line Break** rule (one sentence per line). This is mandatory for the PR review process. 
-| Week | Milestone | -|------|-----------| -| 8 | Proposal due | -| 10 | Progress check-in (office hours) | -| 12 | Draft textbook section (optional feedback) | -| 14 | Code implementation complete | -| 16 | Final presentation + all deliverables due | - -## Grading Rubric (300 points) - -| Component | Points | -|-----------|--------| -| **Textbook Contribution** | **150** | -| - Technical accuracy | 50 | -| - Writing clarity | 40 | -| - Integration with existing chapters | 30 | -| - References and citations | 30 | -| **Implementation / Code** | **100** | -| - Functionality | 40 | -| - Code quality | 30 | -| - Documentation | 30 | -| **Presentation** | **50** | -| - Content clarity | 20 | -| - Slide quality | 15 | -| - Q&A responses | 15 | -| **Total** | **300** | - -## Evaluation Criteria - -### Textbook Contribution - -**Excellent (90-100%)**: -- Novel insights or analysis -- Crystal-clear explanations -- Publication-quality figures and equations -- Comprehensive references - -**Good (80-89%)**: -- Accurate technical content -- Clear writing with minor issues -- Relevant figures and equations -- Adequate references - -**Acceptable (70-79%)**: -- Mostly accurate content -- Understandable but needs polish -- Basic figures/equations -- Some key references missing - -### Code Implementation - -**Excellent (90-100%)**: -- Production-ready code -- Comprehensive documentation -- Runs out-of-the-box -- Includes tests and examples - -**Good (80-89%)**: -- Functional code -- Adequate documentation -- Minor setup issues -- Basic examples - -**Acceptable (70-79%)**: -- Code works with effort -- Minimal documentation -- Requires debugging -- No examples - -### Presentation - -**Excellent (90-100%)**: -- Engaging and clear -- Well-structured slides -- Confident Q&A responses -- On time - -**Good (80-89%)**: -- Clear presentation -- Decent slides -- Handles most questions -- Slightly over/under time - -**Acceptable (70-79%)**: -- Understandable content -- Basic slides -- 
Struggles with some questions -- Noticeable timing issues - -## Submission Process - -### Proposal (Week 8) - -```bash -git checkout -b capstone-proposal-yourname -# Add file: content/course/proposals/yourname-proposal.md -git add content/course/proposals/yourname-proposal.md -git commit -m "Add capstone proposal: Your Name" -git push origin capstone-proposal-yourname -# Open PR to staging -``` - -### Final Submission (Week 16) - -```bash -git checkout -b capstone-final-yourname -# Add textbook section: content/textbook/[chapter]/your-section.mdx -# Add code (if applicable): code/capstone/your-project/ -# Add slides: presentations/yourname-final.pdf -git add . -git commit -m "Add capstone project: Your Title" -git push origin capstone-final-yourname -# Open PR to staging -``` - -## Example Projects from Past Semesters - -### Research Track - -**"Generalization of RT-2 to Novel Objects"** (Jane Doe, 2025) -- Reproduced RT-2 baseline on Open-X dataset -- Generated 50 novel 3D objects with unseen geometries -- Found 23% performance drop on novel objects -- Contributed to Chapter 4 (Evaluation) - -**"Data Augmentation for Robotic Grasping"** (John Smith, 2025) -- Implemented 8 augmentation strategies -- Trained policies with systematic ablations -- Identified that rotation augmentation improves generalization by 15% -- Contributed to Chapter 2 (Data) - -### Engineering Track - -**"Real-Time Vision Encoder for Edge Deployment"** (Alice Johnson, 2025) -- Implemented MobileViT-based encoder -- Achieved 30 FPS on Jetson Orin -- Only 5% accuracy drop vs. 
ViT-B -- Contributed to Chapter 5 (Deployment) - -**"Multi-Task Policy Training Framework"** (Bob Williams, 2025) -- Built PyTorch training harness for 10+ tasks -- Supports multi-GPU, checkpointing, logging -- Open-sourced with 500+ GitHub stars -- Contributed to Chapter 3 (Training) - -### Survey Track - -**"Benchmarking Protocols for Manipulation"** (Carol Lee, 2025) -- Analyzed 30 papers on manipulation benchmarks -- Created taxonomy of evaluation metrics -- Identified reproducibility issues in 60% of papers -- Contributed to Chapter 4 (Evaluation) - -**"Foundation Models for Embodied AI: A Survey"** (David Chen, 2025) -- Surveyed 40 papers on VLMs for robotics -- Mapped landscape of architectures and datasets -- Identified key open problems -- Contributed to Chapter 7 (Future Directions) - -## Resources - -### Writing - -- [How to Write a Great Research Paper](https://www.microsoft.com/en-us/research/academic-program/write-great-research-paper/) -- [LaTeX Math Symbols](https://www.overleaf.com/learn/latex/List_of_Greek_letters_and_math_symbols) -- [MDX Documentation](https://mdxjs.com/) - -### Code - -- [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html) -- [PyTorch Best Practices](https://pytorch.org/tutorials/beginner/saving_loading_models.html) -- [Writing Good Documentation](https://www.writethedocs.org/guide/writing/beginners-guide-to-docs/) - -### Presentation - -- [How to Give a Great Research Talk](https://www.microsoft.com/en-us/research/academic-program/give-great-research-talk/) -- [Presentation Tips](https://www.cs.cmu.edu/~wloescher/presentations.html) +--- -## FAQs +## Team Structure: The $2\times$ Rule -**Q: Can I work in a team?** -A: No, capstone projects must be individual work. However, you can discuss ideas with classmates. +- **Individual Work:** The baseline for a high-quality contribution. +- **Group Work (Optional):** If you choose to work in a group, the technical bar for "Mastery" scales linearly. 
A 2-person team must go **$2\times$ as far**—meaning significantly larger data mixes, more robust baseline comparisons, or cross-embodiment evaluation. +- **Note:** Groups must provide a "Team Contribution Statement" in their proposal. -**Q: Can I extend my paper audit into a capstone?** -A: Yes! If you found an interesting research question during a paper audit, you can explore it further. +--- -**Q: What if my code doesn't work perfectly?** -A: Document what worked, what didn't, and why. Partial results are acceptable if well-analyzed. +## Deliverables & Grading Rubric (250 Points Total) -**Q: Can I contribute to multiple textbook chapters?** -A: Focus on one cohesive section. Quality over quantity. +### 1. Project Specification / Proposal (Pass/Fail - First Architectural Lab) +Submit via the **VLA Architecture Lab Form**. Includes team members, the "Initial Dissolve," and compute/data requirements. -**Q: What if my project scope changes?** -A: Discuss with the instructor. Pivots are allowed with justification. +### 2. Textbook Chapter Contribution (100 Points) +- **Technical Accuracy & Rigor (50 pts):** Correct $\LaTeX$, sound mathematical derivations, and deep critique. +- **Writing & Insights (50 pts):** Must include *Lineage of Failure*, *Intuitive Derivations*, and *Implementation Gotchas*. -## Getting Help +### 3. Code Implementation (75 Points) +- **Functionality & Correctness (50 pts):** Does it solve the stated bottleneck? +- **Code Quality & Docs (25 pts):** Clean Python, README with setup, and unit tests. -- **Office Hours**: Every Tuesday/Thursday 3-4 PM -- **Discussion Forum**: Post questions and get peer feedback -- **Mid-Project Check-In**: Schedule a meeting in Week 12 +### 4. Final Presentation (75 Points) +- **Content Density (50 pts):** 15-minute technical brief. +- **Q&A Rigor (25 pts):** Ability to defend your load-bearing assertions. 
-## Final Notes +--- -The capstone is your opportunity to make a lasting contribution to the VLA research community. Past student projects have been cited in papers, used by other researchers, and featured in the textbook for future cohorts. +## Submission Process: The PR Workflow -**Aim for work you'd be proud to showcase in a job interview or PhD application.** +1. **Branching:** `git checkout -b project/your-handle-topic` +2. **Pathing:** - **Textbook:** `content/textbook/[chapter]/your-section.mdx` + - **Code:** `code/capstone/your-project/` + - **Slides:** `presentations/your-name-final.pdf` +3. **The Loop:** Open a PR to `staging`. A bot will provide a preview link. Iterate until your project reaches **Level 3 (Mastery)** and is merged into the `main` textbook. -Good luck! +> **Final Note:** The capstone is your opportunity to make a lasting contribution to the VLA research community. Aim for work you would be proud to showcase in an AI Engineering interview. diff --git a/content/course/assignments/scratch-1.mdx b/content/course/assignments/scratch-1.mdx index 2b1f2bce..cdb5636d 100644 --- a/content/course/assignments/scratch-1.mdx +++ b/content/course/assignments/scratch-1.mdx @@ -5,11 +5,6 @@ due: 'Sunday, February 1, 9:00 AM MST' points: 100 --- -
-

⚠️ DRAFT: NOT YET ASSIGNED

-

This assignment is still under review and subject to change. Do not begin work until this notice is removed.

-
- # Scratch-1: The Transformer Backbone **Focus**: Implementing the $O(1)$ engine of the VLA stack. @@ -187,7 +182,9 @@ When I removed the causal mask, the following happened: ### Pass Level (B): 70-89 points - ✅ Successful implementation of the backbone -- ✅ Loss converges on the synthetic dataset (< 1.0) +- ✅ Loss shows clear convergence (appreciable decrease from initial loss) + - Expected: Initial loss ~3-4, Final loss ~1.9-2.2 + - Model should demonstrate learning, not achieve arbitrary threshold - ✅ Attention maps visualization included - ✅ Causal mask audit completed - ✅ Code is clean and documented @@ -307,6 +304,13 @@ A: Check: 2. Is the learning rate too high? (Try 1e-4) 3. Are gradients exploding? (Enable gradient clipping) +**Q: What loss should I expect?** +A: With correct implementation on the synthetic trajectory dataset: +- **Initial loss**: ~3-4 (near random guessing for 256-way classification) +- **Final loss**: ~1.9-2.2 (showing clear learning) +- **Key metric**: Appreciable decrease indicating the model learns patterns +- The action encoding represents direction + magnitude toward target, which is learnable but not trivial + ## 11. Deadline **Due**: Sunday, February 1, 9:00 AM MST diff --git a/content/course/submissions/scratch-1/Zaaler.mdx b/content/course/submissions/scratch-1/Zaaler.mdx new file mode 100644 index 00000000..3114471e --- /dev/null +++ b/content/course/submissions/scratch-1/Zaaler.mdx @@ -0,0 +1,123 @@ +--- +title: "Scratch-1 Submission: Zaaler" +student: "Zack Allen" +date: "2026-01-21" +--- + +# Scratch-1: The Transformer Backbone + +## Loss Curve + +Train and Validation Loss Curves with Best Model Shown + +The model converged after 5358 iterations after 19 epochs with final loss of 1.9523. This was determined when the best model performed better than the last 5 models on the validation set. The best model is marked by the green star. 
+ +## Attention Visualization + +The following trajectory is the one that corresponds with the produced attention maps. + +Example Trajectory + +I thought it would be important to understand the trajectory and what makes it unique when looking at the attention heads and layers. + +Here are the attention maps for all 8 heads at layer 0 of the model. + +All Attention Heads within Layer 0 + +These figures show the different aspects that each attention head was keying in on. The average of all these attention heads is shown below. + +All Attention Heads within Layer 0 + +This shows the summation of all the individual heads. Overall, this first layer focuses on information primarily in the 5 to 10 timestamps following the current state (joints, position) in the trajectory. + +All Attention Heads within Layer 1 + +This shows the summation of all the individual heads on layer 2 of the model. Overall, this second layer shows that as we move through the state, we consider things more gradually through longer amounts of the previous information. + +All Attention Heads within Layer 2 + +This shows the summation of all the individual heads on layer 3 of the model. Similarly, this third layer shows that as we move through the state, we consider things more gradually through longer amounts of the previous information with slightly more importance on the 5 most recent timestamps. + +All Attention Heads within Layer 3 + +This shows the summation of all the individual heads on layer 4 of the model. The fourth layer centers its attention on the time stamps following the current timestamp. Trying to recover information from the time directly preceding the current state. + +## The Audit: Removing the Causal Mask + +When I removed the causal mask, the following happened: + +The validation loss was able to reach much lower, all the way down to 0.0573. 
+ +Train and Validation Loss Curves with Best Model Shown + +The attention maps show that the layers are finding correlations between the current state and future states that haven't occurred yet. The model is gathering information from future events that haven't occurred yet. + +All Attention Heads within Layer 0 + +### Why the Model "Cheats" + +The model cheats because it can now see all the future states of the system. Therefore, it can drastically reduce its learning loss by learning the patterns in the trajectories. Essentially, it easily predicts the best next step because it knows where the trajectory was headed in the following timestamps. Therefore, it never learned the actual task, it just learned how to predict when it could copy from the future. + +## Code Highlights + +No special implementation highlights outside of some additional debugging stuff I did to visualize data better. Can add it by changing debug_info to true at top of backbone.py. + +## Challenges and Solutions + +Failed KV Caching Implementation and Inference Speed Comparison + +Attempt is logged in git history and commits but removed from final PR. + +This attempt is likely incorrect. I tried to explain the produced plots. I was able to learn about KV caching and see in principle how it would have been valuable to reduce inference time. + +Comparison of Nominal versus KV Cache Inference Speed + +The figure above shows the difference in inference times with and without KV Caching. Here we see that preventing the recomputation of all previous timestamps K and V values and simply requiring a single computation of the current timestamps K and V values is incredibly advantageous. The furthest left plot shows inference time versus generation length with a fixed prompt of 5 tokens. The plot shows basically no speed increase for trajectory generation lengths up to 50 points. The middle plot talks about inference time versus prompt length. 
Since our trajectories were only 50 in length, I constrained them here. You can see that the amount of time to predict the remaining trajectory decreases as prompt length increases which makes sense (more provided, less to predict). The final plot shows the combination of computation gains based on prompt length and generation length. + +## References + +- [RMSNorm: Root Mean Square Layer Normalization](https://arxiv.org/abs/1910.07467) - Zhang & Sennrich, 2019 +- [RMSNorm Implementation](https://github.com/bzhangGo/rmsnorm) - Reference implementation +- [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) - Su et al., 2021 +- [Rotary Embeddings: A Relative Revolution](https://blog.eleuther.ai/rotary-embeddings/) - EleutherAI +- [Let's build GPT: from scratch, in code, spelled out](https://www.youtube.com/watch?v=kCc8FmEb1nY) - Andrej Karpathy \ No newline at end of file diff --git a/content/course/submissions/scratch-1/images/causal_mask_removed_loss_curves.png b/content/course/submissions/scratch-1/images/causal_mask_removed_loss_curves.png new file mode 100644 index 00000000..514e61d7 Binary files /dev/null and b/content/course/submissions/scratch-1/images/causal_mask_removed_loss_curves.png differ diff --git a/content/course/submissions/scratch-1/images/kv_cache_benchmark.png b/content/course/submissions/scratch-1/images/kv_cache_benchmark.png new file mode 100644 index 00000000..b14b08ae Binary files /dev/null and b/content/course/submissions/scratch-1/images/kv_cache_benchmark.png differ diff --git a/content/course/submissions/scratch-1/images/loss_curves.png b/content/course/submissions/scratch-1/images/loss_curves.png new file mode 100644 index 00000000..ace4e292 Binary files /dev/null and b/content/course/submissions/scratch-1/images/loss_curves.png differ diff --git a/content/course/submissions/scratch-1/images/mask_removed_trajectory_0_all_heads_layer_0.png 
b/content/course/submissions/scratch-1/images/mask_removed_trajectory_0_all_heads_layer_0.png new file mode 100644 index 00000000..324f1483 Binary files /dev/null and b/content/course/submissions/scratch-1/images/mask_removed_trajectory_0_all_heads_layer_0.png differ diff --git a/content/course/submissions/scratch-1/images/trajectory_0_all_heads_layer_0.png b/content/course/submissions/scratch-1/images/trajectory_0_all_heads_layer_0.png new file mode 100644 index 00000000..0d967907 Binary files /dev/null and b/content/course/submissions/scratch-1/images/trajectory_0_all_heads_layer_0.png differ diff --git a/content/course/submissions/scratch-1/images/trajectory_0_attention_layer_0.png b/content/course/submissions/scratch-1/images/trajectory_0_attention_layer_0.png new file mode 100644 index 00000000..e587da4d Binary files /dev/null and b/content/course/submissions/scratch-1/images/trajectory_0_attention_layer_0.png differ diff --git a/content/course/submissions/scratch-1/images/trajectory_0_attention_layer_1.png b/content/course/submissions/scratch-1/images/trajectory_0_attention_layer_1.png new file mode 100644 index 00000000..106b31c6 Binary files /dev/null and b/content/course/submissions/scratch-1/images/trajectory_0_attention_layer_1.png differ diff --git a/content/course/submissions/scratch-1/images/trajectory_0_attention_layer_2.png b/content/course/submissions/scratch-1/images/trajectory_0_attention_layer_2.png new file mode 100644 index 00000000..ef8bf15f Binary files /dev/null and b/content/course/submissions/scratch-1/images/trajectory_0_attention_layer_2.png differ diff --git a/content/course/submissions/scratch-1/images/trajectory_0_attention_layer_3.png b/content/course/submissions/scratch-1/images/trajectory_0_attention_layer_3.png new file mode 100644 index 00000000..ef0f93ec Binary files /dev/null and b/content/course/submissions/scratch-1/images/trajectory_0_attention_layer_3.png differ diff --git 
a/content/course/submissions/scratch-1/images/trajectory_0_end_effector.png b/content/course/submissions/scratch-1/images/trajectory_0_end_effector.png new file mode 100644 index 00000000..e44e139f Binary files /dev/null and b/content/course/submissions/scratch-1/images/trajectory_0_end_effector.png differ diff --git a/data/trajectories.pkl b/data/trajectories.pkl new file mode 100644 index 00000000..d5b55493 Binary files /dev/null and b/data/trajectories.pkl differ diff --git a/grading_reports/GRADING_REPORT.md b/grading_reports/GRADING_REPORT.md new file mode 100644 index 00000000..e04aa476 --- /dev/null +++ b/grading_reports/GRADING_REPORT.md @@ -0,0 +1,65 @@ +![Chris-Bot](~/chris_robot.png) +### 🤖 Chris's Grading Assistant - Feedback Report + +**Student:** @Zaaler +**PR:** #53 +**Branch:** `scratch-1-Zaaler` + +Hi! I've reviewed your submission. Here's what I found: + +--- + +## 📊 Component Feedback + +### ✅ Causal Self-Attention + +✅ Perfect! Your causal mask correctly prevents future token leakage. + +✅ Test passed. + +### ✅ RMSNorm + +✅ RMSNorm implemented correctly with proper normalization and learnable scale. + +✅ Test passed. + +### ✅ Training Loop + +✅ Excellent! Your model trains successfully and loss converges. + +### ✅ RoPE Embeddings + +✅ RoPE correctly applied to Q and K tensors. + +### ✅ Model Architecture + +✅ Model forward pass works end-to-end with correct output shapes. + +✅ Model has the expected number of trainable parameters. + +### ✅ Code Quality + +Your code imports and runs cleanly. Nice! ✨ + +--- + +## 📝 Documentation & Analysis + +✅ Report submitted! I found: +- `content/course/submissions/scratch-1/Zaaler.mdx` +- `README.md` + +Your instructor will review the quality of your analysis. + +--- + +## 🎯 Mastery Features Detected + +I noticed you implemented: +- KV-Caching implementation + +Great work going beyond the requirements! Your instructor will verify implementation quality. 
+ +--- + +> *Grading is automated but reviewed by an instructor. If you have questions, reach out on Slack!* diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..19a06a87 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,54 @@ +[project] +name = "vla-foundations" +version = "0.1.0" +description = "VLA Foundations Course - Private Instructor Repository" +readme = "README.md" +requires-python = ">=3.10,<3.14" +dependencies = [ + "torch>=2.0.0", + "torchvision", + "numpy>=1.24.0", + "pytest>=7.0.0", + "pytest-html>=4.0.0", + "matplotlib>=3.5.0", +] + +[[tool.uv.index]] +name = "pytorch-cpu" +url = "https://download.pytorch.org/whl/cpu" +explicit = true + +[[tool.uv.index]] +name = "pytorch-cu118" +url = "https://download.pytorch.org/whl/cu118" +explicit = true + +[tool.uv.sources] +torch = [ + { index = "pytorch-cpu", marker = "sys_platform == 'darwin'" }, + { index = "pytorch-cu118", marker = "sys_platform == 'linux'" } +] +torchvision = [ + { index = "pytorch-cpu", marker = "sys_platform == 'darwin'" }, + { index = "pytorch-cu118", marker = "sys_platform == 'linux'" } +] + +[tool.hatch.build.targets.wheel] +packages = [] + +[tool.pytest.ini_options] +markers = [ + "internal: internal grading tests (never public)", + "rigor: rigorous grading tests", + "gradient: gradient flow tests", + "fidelity: output comparison tests", + "training: training convergence tests", + "mastery: optional mastery-level features (DINOv2, KV-cache, etc.)", +] +testpaths = ["tests"] +python_files = ["test_*.py"] +python_classes = ["Test*"] +python_functions = ["test_*"] + +[dependency-groups] +dev = [] diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 00000000..ea7a96da --- /dev/null +++ b/pytest.ini @@ -0,0 +1,13 @@ +[pytest] +testpaths = tests +python_files = test_*.py +python_classes = Test* +python_functions = test_* +markers = + public: Tests that students can see and run + internal: Internal grading tests (never public) + rigor: Rigorous 
validation tests for grading + gradient: Tests for gradient flow validation + fidelity: Tests for output quality validation + training: Tests for training convergence + mastery: Optional mastery-level features (DINOv2, KV-cache, etc.) diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 00000000..0e328843 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,49 @@ +# CI/CD Scripts + +**Critical infrastructure scripts** used in GitHub Actions workflows. + +## Contents + +### Production Scripts + +- **`manage_solutions.py`** - Inject/reset assignment solutions (used in testing) +- **`sanitize.sh`** - Main sanitization pipeline for public sync +- **`_sanitize_todos.py`** - Remove solution hints from code +- **`audit_linter.py`** - Validate paper audit MDX files + +### Usage in CI/CD + +| Script | Workflow | Purpose | +|--------|----------|---------| +| `audit_linter.py` | `vla-audit.yml` | Validate audit frontmatter | +| `sanitize.sh` | `sync-to-public.yml` | Remove private content | +| `_sanitize_todos.py` | `sync-to-public.yml` | Strip solution hints | +| `manage_solutions.py` | (local testing) | Inject/reset solutions | + +### Critical Requirements + +1. **Fail-Safe**: All scripts must return non-zero exit codes on failure +2. **Idempotent**: Can be run multiple times safely +3. **Validated**: Must pass linting before sync +4. **Documented**: Clear error messages and usage + +## Development Scripts + +Local development helpers are in `scripts/dev/`. These are **not** used in CI/CD. + +## Modification Guidelines + +Changes to scripts in this directory affect production workflows. Always: + +1. Test locally first +2. Verify exit codes +3. Check GitHub Actions logs +4. Update documentation + +## Security + +These scripts handle sensitive operations: +- `sanitize.sh` - Removes private content before public sync +- `manage_solutions.py` - Manages private solutions + +Never commit secrets or tokens to these scripts. 
diff --git a/scripts/audit_linter.py b/scripts/audit_linter.py index 8ecfa1b8..ff1952a6 100755 --- a/scripts/audit_linter.py +++ b/scripts/audit_linter.py @@ -32,6 +32,62 @@ def check_semantic_breaks(file_path): ) return errors +def validate_frontmatter(file_path, content, lines): + """Validate YAML frontmatter contains required fields.""" + errors = [] + + # Extract frontmatter + if not content.startswith('---'): + errors.append( + f"{file_path}: Missing YAML frontmatter. File must start with '---' followed by " + "title, author, paper, and topic fields." + ) + return errors + + # Find the end of frontmatter + frontmatter_end = None + for i, line in enumerate(lines[1:], start=1): + if line.strip() == '---': + frontmatter_end = i + break + + if frontmatter_end is None: + errors.append( + f"{file_path}: Malformed YAML frontmatter. Missing closing '---'." + ) + return errors + + frontmatter_lines = lines[1:frontmatter_end] + frontmatter_text = '\n'.join(frontmatter_lines) + + # Required fields for audit MDX files + required_fields = ['title', 'author', 'topic', 'paper'] + + for field in required_fields: + # Check if field exists (case-insensitive) + if not any(line.strip().lower().startswith(f'{field}:') for line in frontmatter_lines): + errors.append( + f"{file_path}: Missing required frontmatter field: '{field}'" + ) + + # Validate field values are not empty + for line in frontmatter_lines: + stripped = line.strip() + if ':' in stripped: + field_name, field_value = stripped.split(':', 1) + field_name = field_name.strip().lower() + field_value = field_value.strip() + + if field_name in required_fields: + # Check for empty values or placeholder values + if not field_value or field_value in ['""', "''", 'null', 'TBD', 'TODO']: + errors.append( + f"{file_path}: Empty or placeholder value for required field: '{field_name}'" + ) + + return errors + + def check_mdx_syntax(file_path): """Check for MDX-specific syntax issues.""" with open(file_path, 'r', encoding='utf-8') 
as f: @@ -40,12 +96,8 @@ def check_mdx_syntax(file_path): errors = [] - # Check 1: Must have YAML frontmatter at the start - if not content.startswith('---'): - errors.append( - f"{file_path}: Missing YAML frontmatter. File must start with '---' followed by " - "title, author, paper, and topic fields." - ) + # Check 1: Validate frontmatter fields + errors.extend(validate_frontmatter(file_path, content, lines)) # Check 2: No HTML comments (should use JSX-style {/* */}) if '