diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml index f43437d19c0..fed1792d2f9 100644 --- a/.github/copy-pr-bot.yaml +++ b/.github/copy-pr-bot.yaml @@ -1,4 +1,4 @@ enabled: true auto_sync_draft: false auto_sync_ready: true -trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "Phlip79", "QiZhangNV", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "ahmadki", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cspades", "cuichenx", "deepakn94", "dimapihtar", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "frsun-nvda", "gautham-kollu", "gdengk", "guyueh1", "hxbai", "jalbericiola", "janEbert", "jaredcasper", "jenchen13", "jiemingz", "jingqiny-99", "jkamalu", "jon-barker", "jstjohn", "kanz-nv", "kevalmorabia97", "ko3n1g", "kunlunl", "kvareddy", "kwyss-nvidia", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mehraakash", "mkhona-nvidia", "parthmannan", "prajwal1210", "pthombre", "rogerwaleffe", "sanandaraj5597", "sancha", "santhnm2", "sbak5", "shanmugamr1992", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "sudhakarsingh27", "tdene", "theothermike", "thomasdhc", "trintamaki", "tylerpoon", "wdykas", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yeyu-nvidia", "yobibyte", "youngeunkwon0405", "yuzhongw-nvidia", "zhongbozhu"] +trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "CarlosGomes98", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "Phlip79", "QiZhangNV", "RPrenger", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "ahmadki", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cspades", "cuichenx", "deepakn94", "dimapihtar", "dingqingy-nv", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "faradawn", "frsun-nvda", "gautham-kollu", "gdengk", "guyueh1", "hxbai", "ilml", "jalbericiola", "janEbert", "jaredcasper", "jenchen13", "jiemingz", "jingqiny-99", "jkamalu", "jon-barker", "jstjohn", "kanz-nv", "kevalmorabia97", "ko3n1g", "kunlunl", "kvareddy", "kwyss-nvidia", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mchrzanowski", "mehraakash", "mkhona-nvidia", "parthmannan", "prajwal1210", "pthombre", "rogerwaleffe", "sajadn", "sanandaraj5597", "sancha", "santhnm2", "sbak5", "shanmugamr1992", "sharathts", "shengf-nv", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "sudhakarsingh27", "tdene", "theothermike", "thomasdhc", "trintamaki", "tylerpoon", "wdykas", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yeyu-nvidia", "yobibyte", "youngeunkwon0405", "yueshen2016", "yuzhongw-nvidia", "zhongbozhu"] diff --git a/.github/oncall_schedule.json b/.github/oncall_schedule.json index 5fa49e966bc..58fcf0ddbbc 100644 --- a/.github/oncall_schedule.json +++ b/.github/oncall_schedule.json @@ -1,18 +1,6 @@ [ - { - "user": "dimapihtar", - "date": "2026-01-28" - }, - { - "user": "gautham-kollu", - "date": "2026-02-04" - }, { "user": "janEbert", - "date": "2026-02-11" - }, - { - "user": "Phlip79", "date": "2026-02-18" }, { @@ -46,5 +34,17 @@ { "user": "BoxiangW", "date": "2026-04-15" + }, + { + "user": "Phlip79", + "date": "2026-04-22" + }, + { + "user": "asolergi-nv", + "date": "2026-04-29" + }, + { + "user": "dimapihtar", + "date": "2026-05-06" } ] diff --git a/.github/scripts/readme.sh b/.github/scripts/readme.sh new file mode 100644 index 00000000000..216d5224a28 --- /dev/null +++ b/.github/scripts/readme.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +cat << 'EOF' +╔══════════════════════════════════════════════════════════════════════╗ +║ ║ +║ ███╗ ███╗██████╗ ██████╗ ██╗██████╗ ██████╗ ███████╗ ║ +║ ████╗ ████║██╔══██╗██╔══██╗██║██╔══██╗██╔════╝ ██╔════╝ ║ +║ ██╔████╔██║██████╔╝██████╔╝██║██║ ██║██║ ███╗█████╗ ║ +║ ██║╚██╔╝██║██╔══██╗██╔══██╗██║██║ ██║██║ ██║██╔══╝ ║ +║ ██║ ╚═╝ ██║██████╔╝██║ ██║██║██████╔╝╚██████╔╝███████╗ ║ +║ ╚═╝ ╚═╝╚═════╝ ╚═╝ ╚═╝╚═╝╚═════╝ ╚═════╝ ╚══════╝ ║ +║ ║ +║ H O W T O : M B R I D G E T E S T I N G ║ +╚══════════════════════════════════════════════════════════════════════╝ + + MBridge unit tests run automatically on every PR. To also trigger + functional tests, attach the label and re-run the workflow step. + + ┌─────────────────────────────────────────────────────────────────┐ + │ DEFAULT │ Unit tests run on every PR (no action needed) │ + ├─────────────────────────────────────────────────────────────────┤ + │ │ + │ Every PR ──► cicd-mbridge-testing ──► unit tests only │ + │ │ + └─────────────────────────────────────────────────────────────────┘ + + ┌─────────────────────────────────────────────────────────────────┐ + │ STEP 1 │ Attach the label to your PR (for functional tests) │ + ├─────────────────────────────────────────────────────────────────┤ + │ │ + │ PR Labels ──► [ + Add label ] ──► "Run MBridge tests" │ + │ │ + └─────────────────────────────────────────────────────────────────┘ + + ┌─────────────────────────────────────────────────────────────────┐ + │ STEP 2 │ Re-run this workflow step │ + ├─────────────────────────────────────────────────────────────────┤ + │ │ + │ Actions ──► [ Re-run jobs ] ──► Re-run failed jobs │ + │ │ + └─────────────────────────────────────────────────────────────────┘ + + ┌─────────────────────────────────────────────────────────────────┐ + │ RESULT │ Unit + functional tests run! │ + ├─────────────────────────────────────────────────────────────────┤ + │ │ + │ cicd-mbridge-testing ◄── unit + functional tests │ + │ │ + │ Tests run against MBridge using the merge commit │ + │ SHA of your pull request. │ + │ │ + └─────────────────────────────────────────────────────────────────┘ + + ┌────────────────────────────────────┐ + │ Label present? NO → unit │ + │ Label present? YES → unit + │ + │ functional│ + └────────────────────────────────────┘ + + NOTE: The label must be present BEFORE the re-run is triggered. + The CI checks for "Run MBridge tests" at runtime. + + NOTE: All MBridge test results are optional — failures do not + block merging your PR. +EOF diff --git a/.github/scripts/sync_team_usergroups.py b/.github/scripts/sync_team_usergroups.py index 429387fc6de..c3fa5d474ff 100644 --- a/.github/scripts/sync_team_usergroups.py +++ b/.github/scripts/sync_team_usergroups.py @@ -29,7 +29,12 @@ # Constants GITHUB_API_URL = "https://api.github.com" -PARENT_TEAM_SLUG = "mcore-reviewers" + +# Teams whose *children* are each synced to their own Slack usergroup +PARENT_TEAM_SLUGS = ["mcore-reviewers"] + +# Teams synced directly (the team itself, not its children) +DIRECT_TEAM_SLUGS = ["mcore-engineers"] # Caches for email and Slack lookups _email_cache = {} @@ -83,6 +88,8 @@ def github_team_to_slack_usergroup(team_slug): name = name[5:] # Remove "core-" elif name.startswith("megatron-"): name = name[9:] # Remove "megatron-" + elif name.startswith("mcore-"): + name = name[6:] # Remove "mcore-" # Remove "-and-" name = name.replace("-and-", "-") @@ -437,13 +444,13 @@ def sync_team_to_usergroup(team_slug, usergroup_handle, dry_run=False): return False -def get_team_to_usergroup_mapping(): - """Fetch child teams of mcore-reviewers and generate the mapping.""" +def get_team_to_usergroup_mapping(parent_team_slug): + """Fetch child teams of a parent team and generate the mapping.""" org = get_org() - child_teams = get_child_teams(org, PARENT_TEAM_SLUG) + child_teams = get_child_teams(org, parent_team_slug) if not child_teams: - print(f"Error: No child teams found under '{PARENT_TEAM_SLUG}'") + print(f"Error: No child teams found under '{parent_team_slug}'") return {} mapping = {} @@ -454,10 +461,30 @@ def get_team_to_usergroup_mapping(): return mapping -def sync_all_teams(dry_run=False): - """Sync all GitHub teams under mcore-reviewers to their Slack usergroups.""" - print(f"Fetching child teams of '{PARENT_TEAM_SLUG}'...") - team_to_usergroup = get_team_to_usergroup_mapping() +def sync_all_teams(dry_run=False, parent_teams=None, direct_teams=None): + """Sync GitHub teams to their Slack usergroups. + + Args: + parent_teams: List of team slugs whose *children* are each synced. + Defaults to PARENT_TEAM_SLUGS. + direct_teams: List of team slugs synced directly (not their children). + Defaults to DIRECT_TEAM_SLUGS. + """ + if parent_teams is None: + parent_teams = PARENT_TEAM_SLUGS + if direct_teams is None: + direct_teams = DIRECT_TEAM_SLUGS + + team_to_usergroup = {} + + for parent_slug in parent_teams: + print(f"Fetching child teams of '{parent_slug}'...") + mapping = get_team_to_usergroup_mapping(parent_slug) + team_to_usergroup.update(mapping) + + for team_slug in direct_teams: + usergroup_handle = github_team_to_slack_usergroup(team_slug) + team_to_usergroup[team_slug] = usergroup_handle if not team_to_usergroup: return False @@ -504,12 +531,40 @@ def main(): action="store_true", help="List all configured team-to-usergroup mappings", ) + parser.add_argument( + "--parent-team", + action="append", + dest="parent_teams", + metavar="SLUG", + help=( + "Sync all children of this GitHub team (can be repeated). " + f"Defaults to: {PARENT_TEAM_SLUGS}" + ), + ) + parser.add_argument( + "--team", + action="append", + dest="direct_teams", + metavar="SLUG", + help=( + "Sync this GitHub team directly (can be repeated). " + f"Defaults to: {DIRECT_TEAM_SLUGS}" + ), + ) args = parser.parse_args() + # Use CLI values when provided, otherwise fall back to module-level defaults + parent_teams = args.parent_teams if args.parent_teams is not None else PARENT_TEAM_SLUGS + direct_teams = args.direct_teams if args.direct_teams is not None else DIRECT_TEAM_SLUGS + if args.list: - print(f"Fetching child teams of '{PARENT_TEAM_SLUG}'...") - team_to_usergroup = get_team_to_usergroup_mapping() + team_to_usergroup = {} + for parent_slug in parent_teams: + print(f"Fetching child teams of '{parent_slug}'...") + team_to_usergroup.update(get_team_to_usergroup_mapping(parent_slug)) + for team_slug in direct_teams: + team_to_usergroup[team_slug] = github_team_to_slack_usergroup(team_slug) if not team_to_usergroup: sys.exit(1) print("\nTeam-to-usergroup mappings:") @@ -519,7 +574,9 @@ def main(): print(f"{team:<35} @{usergroup:<29}") return - success = sync_all_teams(dry_run=args.dry_run) + success = sync_all_teams( + dry_run=args.dry_run, parent_teams=parent_teams, direct_teams=direct_teams + ) sys.exit(0 if success else 1) diff --git a/.github/workflows/_build_test_publish_wheel.yml b/.github/workflows/_build_test_publish_wheel.yml index 9e9062827de..0b71577b587 100644 --- a/.github/workflows/_build_test_publish_wheel.yml +++ b/.github/workflows/_build_test_publish_wheel.yml @@ -17,8 +17,6 @@ on: type: boolean default: true secrets: - TWINE_USERNAME: - required: true TWINE_PASSWORD: required: true @@ -147,7 +145,6 @@ jobs: needs: [build-and-test-wheels] runs-on: ubuntu-latest if: inputs.no-publish == false - environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && 'main' || 'public' }} strategy: fail-fast: false matrix: @@ -170,7 +167,7 @@ jobs: - name: Publish wheels env: - TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }} + TWINE_USERNAME: __token__ TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }} TWINE_REPOSITORY: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && 'pypi' || 'testpypi' }} PLATFORM: ${{ matrix.PLATFORM }} diff --git a/.github/workflows/_release_library.yml b/.github/workflows/_release_library.yml index d39ee505c2a..684dacc27aa 100644 --- a/.github/workflows/_release_library.yml +++ b/.github/workflows/_release_library.yml @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -name: 'Release' +name: "Release" defaults: run: @@ -38,13 +38,24 @@ on: description: Create a GitHub release type: boolean default: true + gh-release-use-changelog-builder: + required: false + description: Use release-changelog-builder-action to dynamically build changelog + type: boolean + default: true + gh-release-changelog-config: + required: false + description: Path to changelog builder configuration file + type: string + default: ".github/workflows/config/changelog-config.json" + gh-release-from-tag: + required: false + description: Starting tag for changelog builder (leave empty for auto-detect) + type: string + default: "" secrets: - TWINE_USERNAME: - required: true TWINE_PASSWORD: required: true - SLACK_WEBHOOK_ADMIN: - required: true SLACK_WEBHOOK: required: true PAT: @@ -62,12 +73,10 @@ jobs: ref: ${{ inputs.release-ref }} no-publish: true secrets: - TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }} TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }} bump-next-version: runs-on: ubuntu-latest - environment: main # ${{ inputs.dry-run == true && 'public' || 'main' }} needs: build-test-publish-wheels-dry-run if: | ( @@ -90,8 +99,8 @@ jobs: - name: Bump version MCore id: bump-version-mcore env: - SRC_DIR: '' - PYPROJECT_NAME: 'megatron.core' + SRC_DIR: "" + PYPROJECT_NAME: "megatron.core" run: | set +u cd ${{ github.run_id }} @@ -129,8 +138,8 @@ jobs: - name: Bump version MFSDP id: bump-version-mfsdp env: - SRC_DIR: 'megatron/core/distributed/fsdp/src/' - PYPROJECT_NAME: 'megatron_fsdp' + SRC_DIR: "megatron/core/distributed/fsdp/src/" + PYPROJECT_NAME: "megatron_fsdp" run: | set +u @@ -323,7 +332,6 @@ jobs: create-gh-release: needs: [build-test-publish-wheels, bump-next-version] runs-on: ubuntu-latest - environment: ${{ inputs.dry-run == true && 'public' || 'main' }} if: | ( success() || !failure() @@ -345,12 +353,51 @@ jobs: ref: ${{ inputs.release-ref }} token: ${{ secrets.PAT || secrets.GITHUB_TOKEN }} + - name: Determine fromTag for changelog + id: determine-from-tag + if: inputs.gh-release-use-changelog-builder == true + run: | + cd ${{ github.run_id }} + + # If gh-release-from-tag is provided, use it + if [[ -n "${{ inputs.gh-release-from-tag }}" ]]; then + FROM_TAG="${{ inputs.gh-release-from-tag }}" + echo "Using provided fromTag: $FROM_TAG" + else + # Get the most recent tag + FROM_TAG=$(git describe --tags --abbrev=0 2>/dev/null || echo "") + if [[ -z "$FROM_TAG" ]]; then + echo "No previous tags found, leaving fromTag empty" + else + echo "Auto-detected most recent tag: $FROM_TAG" + fi + fi + + echo "from-tag=$FROM_TAG" >> $GITHUB_OUTPUT + + - name: Build Changelog + id: build-changelog + if: inputs.gh-release-use-changelog-builder == true + uses: mikepenz/release-changelog-builder-action@v6.1.0 + env: + GITHUB_TOKEN: ${{ secrets.PAT || secrets.GITHUB_TOKEN }} + with: + configuration: ${{ github.run_id }}/${{ inputs.gh-release-changelog-config }} + owner: ${{ github.repository_owner }} + repo: ${{ github.event.repository.name }} + ignorePreReleases: "false" + failOnError: "false" + fromTag: ${{ steps.determine-from-tag.outputs.from-tag }} + toTag: ${{ inputs.release-ref }} + mode: ${{ inputs.gh-release-changelog-mode }} + - name: Create release id: version-number env: SHA: ${{ inputs.release-ref }} GH_TOKEN: ${{ secrets.PAT }} IS_DRY_RUN: ${{ inputs.dry-run }} + BUILT_CHANGELOG: ${{ steps.build-changelog.outputs.changelog }} run: | cd ${{ github.run_id }} @@ -359,7 +406,10 @@ jobs: IS_PRERELEASE=$([[ "$IS_RELEASE_CANDIDATE" == "true" || "$IS_ALPHA" == "true" ]] && echo "true" || echo "false") NAME="NVIDIA $PROJECT_NAME ${VERSION}" - if [[ "$IS_RELEASE_CANDIDATE" == "true" ]]; then + # Use built changelog if available, otherwise fall back to CHANGELOG.md + if [[ -n "$BUILT_CHANGELOG" ]]; then + CHANGELOG="$BUILT_CHANGELOG" + elif [[ "$IS_RELEASE_CANDIDATE" == "true" ]]; then DATE=$(date +"%Y-%m-%d") CHANGELOG="Prerelease: $NAME ($DATE)" else @@ -402,10 +452,19 @@ jobs: eval "$CMD" fi + publish-docs: + needs: [bump-next-version, create-gh-release] + uses: ./.github/workflows/release-docs.yml + with: + dry-run: ${{ inputs.dry-run }} + publish-as-latest: true + docs-version-override: ${{ needs.bump-next-version.outputs.release-version }} + build-docs-ref: ${{ inputs.release-ref }} + secrets: inherit + notify: needs: [build-test-publish-wheels, create-gh-release] runs-on: ubuntu-latest - environment: ${{ inputs.dry-run == true && 'public' || 'main' }} env: GH_URL: https://github.com/${{ github.repository }}/releases/tag/v${{ needs.build-test-publish-wheels.outputs.version }} PYPI_URL: https://${{ inputs.dry-run == true && 'test.' || '' }}pypi.org/project/${{ needs.build-test-publish-wheels.outputs.pypi-name }}/${{ needs.build-test-publish-wheels.outputs.version }}/ diff --git a/.github/workflows/_update_dependencies.yml b/.github/workflows/_update_dependencies.yml index 063b966b5de..a60e69f701b 100644 --- a/.github/workflows/_update_dependencies.yml +++ b/.github/workflows/_update_dependencies.yml @@ -9,12 +9,6 @@ on: secrets: PAT: required: true - AZURE_CLIENT_ID: - required: true - AZURE_TENANT_ID: - required: true - AZURE_SUBSCRIPTION_ID: - required: true SSH_KEY: required: true SSH_PWD: @@ -32,36 +26,31 @@ jobs: run: echo "date=$(date +%F)" | tee -a "$GITHUB_OUTPUT" update-lockfile: - environment: nemo-ci runs-on: linux-amd64-cpu16 needs: [pre-flight] env: SOURCE_BRANCH: ${{ needs.pre-flight.outputs.bump-branch }} TARGET_BRANCH: ${{ inputs.target-branch }} steps: - - name: Install Azure CLI - run: curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash - - - name: Azure Login - uses: azure/login@v2 - with: - client-id: ${{ secrets.AZURE_CLIENT_ID }} - tenant-id: ${{ secrets.AZURE_TENANT_ID }} - subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} - - - name: Azure ACR Login - run: az acr login --name nemoci - - name: Checkout repo uses: actions/checkout@v4 with: ref: ${{ env.TARGET_BRANCH }} + - name: Mock test data + run: mkdir -p assets/ + + - name: Fetch NGC Version + id: ngc-version + run: | + NGC_VERSION=$(cat docker/.ngc_version.dev) + echo "NGC_VERSION=${NGC_VERSION}" | tee -a "$GITHUB_OUTPUT" + - name: Build container env: GH_TOKEN: ${{ secrets.PAT }} run: | - docker build -f docker/Dockerfile.ci.dev --build-arg FROM_IMAGE_NAME="nvcr.io/nvidia/pytorch:25.06-py3" --target=main -t megatron-core . + docker build -f docker/Dockerfile.ci.dev --build-arg FROM_IMAGE_NAME="${{ steps.ngc-version.outputs.NGC_VERSION }}" --target=main -t megatron-core . - name: Create bump branch if not exists run: | @@ -96,7 +85,6 @@ jobs: create-pr: needs: [update-lockfile, pre-flight] runs-on: ubuntu-latest - environment: main env: SOURCE_BRANCH: ${{ needs.pre-flight.outputs.bump-branch }} TARGET_BRANCH: ${{ inputs.target-branch }} @@ -107,18 +95,6 @@ jobs: token: ${{ secrets.PAT }} ref: ${{ env.TARGET_BRANCH }} - - name: Install GPG - run: sudo apt-get install -y gnupg2 - - - name: Import GPG key (for signing) - uses: crazy-max/ghaction-import-gpg@e89d40939c28e39f97cf32126055eeae86ba74ec - id: gpg-action - with: - gpg_private_key: ${{ secrets.SSH_KEY }} - passphrase: ${{ secrets.SSH_PWD }} - git_user_signingkey: true - git_commit_gpgsign: true - - name: Rebase against ${{ env.SOURCE_BRANCH }} run: | if git ls-remote --exit-code origin ${{ env.SOURCE_BRANCH }}; then @@ -150,4 +126,102 @@ jobs: 🙏 Please merge this PR only if the CI workflow completed successfully. commit-message: ${{ env.title }} signoff: true - committer: "${{ steps.gpg-action.outputs.name }} <${{ steps.gpg-action.outputs.email }}>" + committer: "github-actions[bot] " + + - name: Post /ok to test comment + env: + GH_TOKEN: ${{ secrets.PAT }} + run: | + PR_NUMBER="${{ steps.create-pull-request.outputs.pull-request-number }}" + if [ -z "$PR_NUMBER" ]; then + echo "No PR was created, skipping comment" + exit 0 + fi + SHA="${{ steps.create-pull-request.outputs.pull-request-head-sha }}" + gh pr comment "$PR_NUMBER" --body "/ok to test $SHA" + + - name: Wait for CI checks + env: + GH_TOKEN: ${{ secrets.PAT }} + run: | + PR_NUMBER="${{ steps.create-pull-request.outputs.pull-request-number }}" + if [ -z "$PR_NUMBER" ]; then + echo "No PR was created, skipping wait" + exit 0 + fi + + # Fetch required status checks from branch protection rules + REQUIRED_CHECKS=$(gh api \ + "repos/${{ github.repository }}/branches/${{ env.TARGET_BRANCH }}/protection/required_status_checks" \ + --jq '.checks[].context' 2>/dev/null \ + || gh api \ + "repos/${{ github.repository }}/branches/${{ env.TARGET_BRANCH }}/protection/required_status_checks" \ + --jq '.contexts[]' 2>/dev/null \ + || true) + + if [ -z "$REQUIRED_CHECKS" ]; then + echo "No branch protection rules found for ${{ env.TARGET_BRANCH }}, skipping wait" + exit 0 + fi + + echo "Required checks from branch protection:" + echo "$REQUIRED_CHECKS" + + echo "Waiting for required checks to complete on PR #$PR_NUMBER..." + i=0 + INITIALIZED=false + while true; do + i=$((i + 1)) + CHECKS_JSON=$(gh pr checks "$PR_NUMBER" --json name,state 2>/dev/null || echo "[]") + ALL_DONE=true + FAILED_CHECKS="" + while IFS= read -r check; do + CHECK_STATE=$(echo "$CHECKS_JSON" | jq -r --arg name "$check" '.[] | select(.name == $name) | .state // ""' | tr '[:upper:]' '[:lower:]') + case "$CHECK_STATE" in + *success*|*pass*|*skip*|*neutral*) ;; + *pending*|*queued*|*progress*|*waiting*|*request*|"") + ALL_DONE=false + INITIALIZED=true + break + ;; + *) + if [ "$INITIALIZED" = "true" ]; then + FAILED_CHECKS="${FAILED_CHECKS} - ${check} (${CHECK_STATE})"$'\n' + else + ALL_DONE=false + fi + ;; + esac + done <<< "$REQUIRED_CHECKS" + if [ "$ALL_DONE" = "true" ]; then + if [ -n "$FAILED_CHECKS" ]; then + echo "Required check(s) did not pass:" + echo "$FAILED_CHECKS" + exit 1 + fi + echo "All required checks passed!" + break + fi + echo "Checks not yet complete (attempt $i), retrying in 30s..." + sleep 30 + done + + - name: Merge PR + env: + title: "chore(beep boop 🤖): Bump `uv.lock` (${{ env.TARGET_BRANCH}}) (${{ needs.pre-flight.outputs.date }})" + run: | + PR_NUMBER="${{ steps.create-pull-request.outputs.pull-request-number }}" + if [ -z "$PR_NUMBER" ]; then + echo "No PR was created, skipping merge" + exit 0 + fi + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git fetch origin ${{ env.SOURCE_BRANCH }} + git fetch origin ${{ env.TARGET_BRANCH }} + git checkout ${{ env.TARGET_BRANCH }} + git merge --squash origin/${{ env.SOURCE_BRANCH }} + git commit -m "${{ env.title }}" + git pull --rebase origin ${{ env.TARGET_BRANCH }} + git push origin ${{ env.TARGET_BRANCH }} + git push origin --delete ${{ env.SOURCE_BRANCH }} diff --git a/.github/workflows/auto-assign-milestone.yml b/.github/workflows/auto-assign-milestone.yml index 8153728f9fd..b972329bac1 100644 --- a/.github/workflows/auto-assign-milestone.yml +++ b/.github/workflows/auto-assign-milestone.yml @@ -13,7 +13,6 @@ permissions: jobs: assign-milestone: runs-on: ubuntu-latest - environment: nemo-ci if: github.repository == 'NVIDIA/Megatron-LM' steps: - name: Get PR info diff --git a/.github/workflows/auto-reminder-bot.yml b/.github/workflows/auto-reminder-bot.yml index c3aa8169b50..37e6e5498e3 100644 --- a/.github/workflows/auto-reminder-bot.yml +++ b/.github/workflows/auto-reminder-bot.yml @@ -9,7 +9,6 @@ on: jobs: run-script: - environment: main name: Run Auto Reminder Bot runs-on: ubuntu-latest if: github.repository == 'NVIDIA/Megatron-LM' @@ -28,7 +27,7 @@ jobs: - name: Run Auto Reminder Bot run: | - export SLACK_TOKEN=${{ secrets.SLACK_TOKEN }} - export SLACK_WEBHOOK_URL=${{ secrets.SLACK_WEBHOOK_URL }} + export SLACK_TOKEN=${{ secrets.SLACK_BOT_TOKEN }} + export SLACK_WEBHOOK_URL=${{ secrets.SLACK_REVIEW_REMINDER_CHANNEL_WEBHOOK }} export GH_TOKEN=${{ secrets.PAT }} python tests/test_utils/python_scripts/auto_reminder_github.py diff --git a/.github/workflows/auto-update-copy-pr-bot.yml b/.github/workflows/auto-update-copy-pr-bot.yml index 5f6f1ade9e8..3358a747f34 100644 --- a/.github/workflows/auto-update-copy-pr-bot.yml +++ b/.github/workflows/auto-update-copy-pr-bot.yml @@ -3,12 +3,11 @@ name: Auto Update Copy PR Bot on: workflow_dispatch: schedule: - - cron: '0 0 * * *' + - cron: "0 0 * * *" jobs: auto-update-copy-pr-bot: runs-on: ubuntu-latest - environment: nemo-ci if: github.repository == 'NVIDIA/Megatron-LM' steps: - name: Checkout code diff --git a/.github/workflows/build-test-publish-wheel.yml b/.github/workflows/build-test-publish-wheel.yml index bca859d0e61..00711b50806 100644 --- a/.github/workflows/build-test-publish-wheel.yml +++ b/.github/workflows/build-test-publish-wheel.yml @@ -17,10 +17,9 @@ name: Build, test, and publish a PyPi wheel (to testpypi). on: push: branches: - - dev - main - - 'pull-request/[0-9]+' - - 'deploy-release/*' + - "pull-request/[0-9]+" + - "deploy-release/*" merge_group: types: [checks_requested] @@ -34,7 +33,7 @@ permissions: jobs: pre-flight: - uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.65.5 + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.73.2 if: github.repository == 'NVIDIA/Megatron-LM' build-test-publish-wheels: @@ -43,8 +42,7 @@ jobs: with: no-publish: true secrets: - TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }} - TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }} + TWINE_PASSWORD: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && secrets.SVC_PYPI_TOKEN || secrets.SVC_PYPI_TEST_TOKEN }} build-test-publish-wheel-summary: needs: [pre-flight, build-test-publish-wheels] @@ -66,7 +64,7 @@ jobs: env: GH_TOKEN: ${{ github.token }} GITHUB_RUN_ID: ${{ github.run_id }} - SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' || needs.pre-flight.outputs.is_merge_group == 'true' || needs.pre-flight.outputs.is_ci_workload == 'true' || github.ref != 'refs/heads/main' }} + SKIPPING_IS_ALLOWED: true run: | FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0 diff --git a/.github/workflows/check_api_backwards_compatibility_workflow.yml b/.github/workflows/check_api_backwards_compatibility_workflow.yml deleted file mode 100644 index 4ba0ed2780c..00000000000 --- a/.github/workflows/check_api_backwards_compatibility_workflow.yml +++ /dev/null @@ -1,273 +0,0 @@ -name: API Compatibility Check - -on: - push: - branches: - - dev - - main - - 'pull-request/[0-9]+' - - 'deploy-release/*' - merge_group: - types: [checks_requested] - - # Allow manual trigger - workflow_dispatch: - inputs: - baseline: - description: 'Baseline git reference (tag/branch/commit)' - required: true - -jobs: - pre-flight: - name: Pre-flight check - runs-on: ubuntu-latest - outputs: - should_skip: ${{ steps.check_files.outputs.should_skip }} - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Check if relevant files changed - id: check_files - run: | - # For manual triggers, never skip - if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then - echo "should_skip=false" >> $GITHUB_OUTPUT - echo "Manual trigger - will run compatibility check" - exit 0 - fi - - # Determine base SHA based on event type - if [ "${{ github.event_name }}" == "merge_group" ]; then - BASE_SHA="${{ github.event.merge_group.base_sha }}" - echo "Merge group event - comparing against base: $BASE_SHA" - else - # For push events, use merge-base to find common ancestor - # This ensures we only detect changes actually made in this PR branch, - # not changes that happened in dev after the branch was created - BASE_SHA=$(git merge-base origin/dev HEAD 2>/dev/null || echo "") - if [ -z "$BASE_SHA" ]; then - # Fallback for branches targeting main - BASE_SHA=$(git merge-base origin/main HEAD 2>/dev/null || echo "") - fi - echo "Push event - comparing against merge-base: $BASE_SHA" - fi - - if [ -z "$BASE_SHA" ]; then - echo "Could not determine base SHA - will run compatibility check" - echo "should_skip=false" >> $GITHUB_OUTPUT - exit 0 - fi - - # Check for changes in megatron/core Python files (excluding tests and legacy) - # Note: Using both *.py and **/*.py to match files at root and in subdirectories - CHANGED_FILES=$(git diff --name-only "$BASE_SHA" HEAD -- \ - 'megatron/core/*.py' \ - 'megatron/core/**/*.py' \ - ':!megatron/core/tests/**' \ - ':!megatron/legacy/**' 2>/dev/null || echo "") - - if [ -z "$CHANGED_FILES" ]; then - echo "should_skip=true" >> $GITHUB_OUTPUT - echo "No relevant megatron/core files changed - will skip compatibility check" - else - echo "should_skip=false" >> $GITHUB_OUTPUT - echo "Relevant files changed:" - echo "$CHANGED_FILES" - fi - - check-compatibility: - needs: [pre-flight] - if: needs.pre-flight.outputs.should_skip != 'true' - name: "OPTIONAL: Check API Backward Compatibility" - runs-on: ubuntu-latest - - # ============================================================================ - # Configuration Parameters (modify here) - # ============================================================================ - env: - # Default baseline for automatic PR checks - # Can be: branch name (e.g., 'main'), commit hash, or tag - # Will be resolved to commit hash during execution - DEFAULT_BASELINE: 'ed804b49860201e7103ce0f9c1129a330a384a65' - # Tag pattern for auto-detection (e.g., 'core_r*', 'core_v*') - TAG_PATTERN: 'core_v*' - # Tag regex filter (e.g., '^core_v[0-9]+\.[0-9]+\.[0-9]+$' for stable versions only) - TAG_REGEX_FILTER: '^core_v[0-9]+\.[0-9]+\.[0-9]+$' - # ============================================================================ - - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 0 # Need full history to access baseline ref - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.12' - - - name: Install griffe - run: | - python -m pip install --upgrade pip - python -m pip install griffe - python -c "import griffe; print('Griffe installed successfully')" - python -c "from griffe import Object; print('Object import successful')" || echo "Object import from griffe failed" - python -c "from griffe.dataclasses import Object; print('Object import from dataclasses successful')" || echo "Object import from dataclasses failed" - - - name: Determine baseline reference - id: baseline - run: | - if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then - # Use manually specified baseline (branch, tag, or commit hash) - BASELINE_REF="${{ github.event.inputs.baseline }}" - else - # Use the configured default baseline - BASELINE_REF="${{ env.DEFAULT_BASELINE }}" - - # Uncomment below to auto-detect from tags instead: - # BASELINE_REF=$(git tag -l '${{ env.TAG_PATTERN }}' | grep -E '${{ env.TAG_REGEX_FILTER }}' | sort -V | tail -1) - # if [ -z "$BASELINE_REF" ]; then - # echo "Warning: No tags matching pattern found. Using default: ${{ env.DEFAULT_BASELINE }}" >&2 - # BASELINE_REF="${{ env.DEFAULT_BASELINE }}" - # fi - fi - - # Resolve baseline to commit hash (works for branches, tags, or commit hashes) - BASELINE_HASH=$(git rev-parse "$BASELINE_REF") - - echo "baseline=$BASELINE_HASH" >> $GITHUB_OUTPUT - echo "Using baseline: $BASELINE_REF (resolved to commit: $BASELINE_HASH)" - - - name: Run compatibility check - id: compat_check - run: | - # Save output to file for later display - python scripts/check_api_backwards_compatibility.py \ - --baseline ${{ steps.baseline.outputs.baseline }} \ - --verbose 2>&1 | tee compat_check_output.txt - - # Capture exit code - EXIT_CODE=${PIPESTATUS[0]} - echo "exit_code=$EXIT_CODE" >> $GITHUB_OUTPUT - exit $EXIT_CODE - continue-on-error: true - - - name: Fail job if breaking changes detected - if: steps.compat_check.outcome == 'failure' - run: | - echo "" - echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" - echo "🔍 WHAT IS THIS CHECK?" - echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" - echo "" - echo "This check ensures that changes to Megatron Core's public API do not" - echo "break backward compatibility for users. It compares your PR against" - echo "the latest stable release to detect breaking changes in:" - echo "" - echo " • Function signatures (parameters, order, types)" - echo " • Class structures and methods" - echo " • Return types and public interfaces" - echo "" - echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" - echo "🛠️ HOW TO FIX THIS" - echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" - echo "" - echo "Choose ONE of these resolution strategies:" - echo "" - echo "1️⃣ REVERT THE BREAKING CHANGE (Recommended)" - echo " → Modify your code to preserve backward compatibility" - echo " → Add new parameters as optional (with defaults)" - echo " → Keep existing parameters in the same order" - echo "" - echo "2️⃣ MARK AS INTERNAL API (If this is internal code)" - echo " → Add @internal_api decorator from megatron.core.utils" - echo "" - echo " Example (for classes):" - echo " from megatron.core.utils import internal_api" - echo "" - echo " @internal_api" - echo " class ExperimentalFeature:" - echo " pass" - echo "" - echo " Example (for functions):" - echo " from megatron.core.utils import internal_api" - echo "" - echo " @internal_api" - echo " def internal_helper_function():" - echo " pass" - echo "" - echo "3️⃣ MARK AS EXPERIMENTAL API (If this is experimental code)" - echo " → Add @experimental_api decorator from megatron.core.utils" - echo "" - echo " Example:" - echo " from megatron.core.utils import experimental_api" - echo "" - echo " @experimental_api" - echo " class ExperimentalFeature:" - echo " pass" - echo "" - echo "4️⃣ USE DEPRECATION (For gradual API changes)" - echo " → Add @deprecated decorator for transition period" - echo " → Example:" - echo " from megatron.core.utils import deprecated" - echo "" - echo " @deprecated(version='1.0', removal_version='2.0'," - echo " alternative='new_function')" - echo " def old_function():" - echo " pass" - echo "" - echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" - echo "📋 BREAKING CHANGES DETECTED" - echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" - echo "" - cat compat_check_output.txt - echo "" - echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" - echo "📚 MORE INFORMATION" - echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" - echo "" - echo "📖 Full documentation: docs/api-backwards-compatibility-check.md" - echo "🔧 Checker script: scripts/check_api_backwards_compatibility.py" - echo "❓ Questions? Check the docs or ask in #megatron-core" - echo "" - - echo "::error::Breaking API changes detected. Please review the output above and choose a resolution strategy." - exit 1 - - - name: Success message - if: steps.compat_check.outcome == 'success' - run: | - echo "::notice::✅ No breaking API changes detected!" - - api-backward-compatibility-summary: - needs: [pre-flight, check-compatibility] - runs-on: ubuntu-latest - name: "OPTIONAL: API Backward Compatibility Check Summary" - if: always() && !cancelled() - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Validate workflow result - shell: bash -x -e -u -o pipefail {0} - env: - GH_TOKEN: ${{ github.token }} - SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.should_skip == 'true' }} - run: | - FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success" and .name != "OPTIONAL: API Backward Compatibility Check Summary")] | length') || echo 0 - - if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then - if [ "$SKIPPING_IS_ALLOWED" == "true" ]; then - echo "✅ Compatibility check was skipped (no relevant files changed)" - else - echo "✅ All checks passed successfully" - fi - exit 0 - else - echo "❌ Found $FAILED_JOBS failed job(s)" - gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success" and .name != "OPTIONAL: API Backward Compatibility Check Summary") | .name' - exit 1 - fi diff --git a/.github/workflows/cherry-pick-release-commit.yml b/.github/workflows/cherry-pick-release-commit.yml index 58b447939a7..9da305f07e6 100644 --- a/.github/workflows/cherry-pick-release-commit.yml +++ b/.github/workflows/cherry-pick-release-commit.yml @@ -17,7 +17,6 @@ on: push: branches: - main - - dev jobs: cherry-pick: @@ -27,5 +26,5 @@ jobs: target-branches-pattern: 'core_(*dev_)?r[0-9]+\.[0-9]+\.[0-9]+' secrets: PAT: ${{ secrets.PAT }} - SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }} - SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} + SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_TEAM_GROUP_ID }} + SLACK_WEBHOOK: ${{ secrets.SLACK_CI_CHANNEL_WEBHOOK }} diff --git a/.github/workflows/cicd-approve-test-queue.yml b/.github/workflows/cicd-approve-test-queue.yml index 1c35031cb35..2cba41eafb8 100644 --- a/.github/workflows/cicd-approve-test-queue.yml +++ b/.github/workflows/cicd-approve-test-queue.yml @@ -155,8 +155,6 @@ jobs: workflow_id = workflow["id"] workflow_name = workflow["display_title"] - pr_info = workflow.get("pull_requests", [{}])[0] - pr_number = pr_info.get("number", "unknown") print(f"Approving workflow {workflow_name} with Run Id: {workflow_id}") deployment_url = f"actions/runs/{workflow_id}/pending_deployments" @@ -183,8 +181,8 @@ jobs: steps: - name: Notify env: - SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} - SLACK_WEBHOOK_ADMIN: + SLACK_WEBHOOK: ${{ secrets.SLACK_CI_CHANNEL_WEBHOOK }} + SLACK_WEBHOOK_ADMIN: GITHUB_RUN_ID: ${{ github.run_id }} GITHUB_REPOSITORY: ${{ github.repository }} run: | diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 3aff7995099..2cc025baf99 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -77,6 +77,9 @@ jobs: run: | # Skip SSO check for scheduled jobs, main branch, dev branch, or merge groups if [ "${{ env.SCHEDULED_JOB }}" == "true" ] || [ "${IS_MAIN_BRANCH}" == "true" ] || [ "${IS_DEV_BRANCH}" == "true" ] || [ "${IS_MERGE_GROUP}" == "true" ]; then + echo "is_maintainer=true" | tee -a $GITHUB_OUTPUT + exit 0 + fi # Use SSO membership check result IS_MEMBER="${{ steps.check-sso.outputs.is_member }}" @@ -126,7 +129,7 @@ jobs: pre-flight: needs: [is-not-external-contributor] if: github.repository == 'NVIDIA/Megatron-LM' - uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.65.10 + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.73.2 linting: runs-on: ubuntu-latest @@ -186,6 +189,127 @@ jobs: echo "Running CI tests" echo "is_merge_group: ${{ needs.pre-flight.outputs.is_merge_group }}" + cicd-parse-downstream-testing: + runs-on: ubuntu-latest + needs: + - pre-flight + - cicd-wait-in-queue + if: | + needs.pre-flight.result != 'cancelled' + && needs.cicd-wait-in-queue.result != 'cancelled' + && ( + success() + || needs.pre-flight.outputs.is_ci_workload == 'true' + || needs.pre-flight.outputs.force_run_all == 'true' + || needs.pre-flight.outputs.is_merge_group == 'true' + ) + && !cancelled() + outputs: + mbridge-test-suite: ${{ steps.select-mbridge-test-suite.outputs.main }} + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Get PR info + id: get-pr-info + if: startsWith(github.ref, 'refs/heads/pull-request/') + uses: nv-gha-runners/get-pr-info@main + + - name: Select MBridge test suite + id: select-mbridge-test-suite + env: + GH_TOKEN: ${{ secrets.PAT }} + run: | + PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }} + TEST_SUITE=$(gh pr view $PR_NUMBER --json labels | jq -r 'if [.labels[].name] | any(. == "Run MBridge tests") then "all" else "unit-only" end') + echo "main=$TEST_SUITE" | tee -a $GITHUB_OUTPUT + + - name: How-To + run: bash .github/scripts/readme.sh + + cicd-mbridge-testing: + runs-on: ubuntu-latest + needs: + - pre-flight + - cicd-wait-in-queue + - cicd-parse-downstream-testing + if: | + needs.pre-flight.result != 'cancelled' + && needs.cicd-wait-in-queue.result != 'cancelled' + && needs.cicd-parse-downstream-testing.result != 'cancelled' + && ( + success() + || needs.pre-flight.outputs.is_ci_workload == 'true' + || needs.pre-flight.outputs.force_run_all == 'true' + || needs.pre-flight.outputs.is_merge_group == 'true' + ) + && !cancelled() + steps: + - name: Get PR info + id: get-pr-info + if: startsWith(github.ref, 'refs/heads/pull-request/') + uses: nv-gha-runners/get-pr-info@main + + - name: Checkout MBridge and create testing branch + uses: actions/checkout@v4 + with: + ref: main + repository: NVIDIA-NeMo/Megatron-Bridge + path: megatron-bridge + token: ${{ secrets.PAT }} + + - name: Create testing branch + env: + MBRIDGE_BRANCH_NAME: mcore-testing-${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || github.run_id }} + run: | + cd megatron-bridge + git fetch origin main + git checkout -b ${{ env.MBRIDGE_BRANCH_NAME }} origin/main + git push origin ${{ env.MBRIDGE_BRANCH_NAME }} --force + + - name: Get merge commit sha + shell: bash -x -e -u -o pipefail {0} + id: sha + env: + IS_PR: ${{ startsWith(github.ref, 'refs/heads/pull-request/') }} + IS_MERGE_GROUP: ${{ github.event_name == 'merge_group' }} + run: | + if [[ "$IS_PR" == "true" ]]; then + SHA=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').merge_commit_sha }} + elif [[ "$IS_MERGE_GROUP" == "true" ]]; then + SHA=${{ github.event.merge_group.head_sha }} + else + SHA=${GITHUB_SHA} + fi + echo "main=${SHA}" | tee -a "$GITHUB_OUTPUT" + + - name: Trigger MBridge tests + uses: convictional/trigger-workflow-and-wait@v1.6.5 + env: + MBRIDGE_BRANCH_NAME: mcore-testing-${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || github.run_id }} + with: + owner: NVIDIA-NeMo + repo: Megatron-Bridge + workflow_file_name: cicd-main.yml + github_token: ${{ secrets.PAT }} + ref: ${{ env.MBRIDGE_BRANCH_NAME }} + wait_interval: 60 + propagate_failure: true + client_payload: | + { + "mcore_ref": "${{ steps.sha.outputs.main }}", + "test_suite": "${{ needs.cicd-parse-downstream-testing.outputs.mbridge-test-suite }}", + "triggered_by": "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" + } + + - name: Delete testing branch + if: always() + env: + MBRIDGE_BRANCH_NAME: mcore-testing-${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || github.run_id }} + run: | + cd megatron-bridge + git push origin --delete ${{ env.MBRIDGE_BRANCH_NAME }} + cicd-container-build: needs: [is-not-external-contributor, pre-flight, cicd-wait-in-queue] runs-on: ${{ needs.is-not-external-contributor.outputs.selected_runner }} @@ -350,7 +474,7 @@ jobs: - name: Parse unit tests id: parse-unit-tests run: | - cat tests/test_utils/recipes/unit-tests.yaml | yq -o json '[.products[].test_case[] | { "bucket": .}] | sort_by(.model, .test_case)' | jq -c > unit-tests.json + cat tests/test_utils/recipes/h100/unit-tests.yaml | yq -o json '[.products[].test_case[] | { "bucket": .}] | sort_by(.model, .test_case)' | jq -c > unit-tests.json echo "unit-tests=$(cat unit-tests.json)" | tee -a $GITHUB_OUTPUT cicd-unit-tests-latest: @@ -568,8 +692,8 @@ jobs: GITHUB_RUN_ID: ${{ github.run_id }} SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' || needs.pre-flight.outputs.is_merge_group == 'true' || needs.pre-flight.outputs.is_ci_workload == 'true' }} run: | - FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion == "failure" and .name != "merge-queue-notification")] | length') || echo 0 - SKIPPED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion == "skipped" and .name != "merge-queue-notification")] | length') || echo 0 + FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion == "failure" and .name != "merge-queue-notification" and .name != "cicd-mbridge-testing")] | length') || echo 0 + SKIPPED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion == "skipped" and .name != "merge-queue-notification" and .name != "cicd-mbridge-testing")] | length') || echo 0 if [ "${FAILED_JOBS:-0}" -eq 0 ] && ([ "${SKIPPED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]); then echo "✅ All previous jobs completed successfully" diff --git a/.github/workflows/config/changelog-config.json b/.github/workflows/config/changelog-config.json new file mode 100644 index 00000000000..e640b90a0f3 --- /dev/null +++ b/.github/workflows/config/changelog-config.json @@ -0,0 +1,24 @@ +{ + "categories": [], + "ignore_labels": [ + "ignore" + ], + "sort": "ASC", + "template": "\n${{CHANGELOG}}\n\n
Changelog Details\n\n${{UNCATEGORIZED}}\n
\n", + "pr_template": "- ${{TITLE}} by @${{AUTHOR}} :: PR: #${{NUMBER}}", + "commit_template": "- ${{TITLE}} by @${{AUTHOR}}", + "empty_template": "${{OWNER}}\n${{REPO}}\n${{FROM_TAG}}\n${{TO_TAG}}", + "duplicate_filter": { + "pattern": ".+", + "on_property": "title", + "method": "match" + }, + "transformers": [], + "max_tags_to_fetch": 100, + "max_pull_requests": 500, + "max_back_track_time_days": 365, + "exclude_merge_branches": [], + "tag_resolver": { + "method": "semver" + } +} diff --git a/.github/workflows/copyright-check.yml b/.github/workflows/copyright-check.yml index ac0d49daf9a..a7f51cd8a0e 100644 --- a/.github/workflows/copyright-check.yml +++ b/.github/workflows/copyright-check.yml @@ -17,14 +17,14 @@ name: Copyright check on: push: branches: - - 'pull-request/[0-9]+' - - 'deploy-release/*' + - "pull-request/[0-9]+" + - "deploy-release/*" merge_group: types: [checks_requested] jobs: pre-flight: - uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.65.10 + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.73.2 if: github.repository == 'NVIDIA/Megatron-LM' copyright-check: @@ -48,8 +48,13 @@ jobs: && github.repository == 'NVIDIA/Megatron-LM' runs-on: ubuntu-latest steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Result env: + GH_TOKEN: ${{ github.token }} + GITHUB_RUN_ID: ${{ github.run_id }} SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' || needs.pre-flight.outputs.is_merge_group == 'true' || needs.pre-flight.outputs.is_ci_workload == 'true' }} run: | FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0 diff --git a/.github/workflows/dependabot.yml b/.github/workflows/dependabot.yml index 9dc1e6ac5a9..81a5cd57d12 100644 --- a/.github/workflows/dependabot.yml +++ b/.github/workflows/dependabot.yml @@ -11,7 +11,6 @@ permissions: jobs: get-release-branch-names: runs-on: ubuntu-latest - environment: nemo-ci outputs: mcore: ${{ steps.get-branch.outputs.mcore_release_branch }} if: github.repository == 'NVIDIA/Megatron-LM' @@ -21,11 +20,11 @@ jobs: env: PAT: ${{ secrets.PAT }} run: | - latest_branch=$(git ls-remote --heads https://token:${PAT}@github.com/NVIDIA-NeMo/Eval.git 'refs/heads/r*' | + latest_branch=$(git ls-remote --heads https://token:${PAT}@github.com/NVIDIA/Megatron-LM.git 'refs/heads/core_r*' | grep -o 'core_r[0-9]\+\.[0-9]\+\.[0-9]\+' | sort -V | tail -n1) - echo "mcore_release_branch=$latest_branch" >> $GITHUB_OUTPUT + echo "mcore_release_branch=$latest_branch" | tee -a $GITHUB_OUTPUT bump-tags: needs: [get-release-branch-names] @@ -41,9 +40,6 @@ jobs: target-branch: ${{ matrix.target-branch }} secrets: PAT: ${{ secrets.PAT }} - AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }} - AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }} - AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }} SSH_KEY: ${{ secrets.SSH_KEY }} SSH_PWD: ${{ secrets.SSH_PWD }} @@ -54,8 +50,8 @@ jobs: steps: - name: Notify env: - SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} - SLACK_WEBHOOK_ADMIN: + SLACK_WEBHOOK: ${{ secrets.SLACK_CI_CHANNEL_WEBHOOK }} + SLACK_WEBHOOK_ADMIN: GITHUB_RUN_ID: ${{ github.run_id }} GITHUB_REPOSITORY: ${{ github.repository }} run: | diff --git a/.github/workflows/install-test.yml b/.github/workflows/install-test.yml index ece9184ee94..5a0abb8596d 100644 --- a/.github/workflows/install-test.yml +++ b/.github/workflows/install-test.yml @@ -22,14 +22,14 @@ on: branches: - dev - main - - 'pull-request/[0-9]+' - - 'deploy-release/*' + - "pull-request/[0-9]+" + - "deploy-release/*" merge_group: types: [checks_requested] jobs: pre-flight: - uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.65.5 + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.73.2 if: github.repository == 'NVIDIA/Megatron-LM' pip-test-pytorch: @@ -43,11 +43,10 @@ jobs: name: Pip - Python${{ matrix.python-version }} - AMD64/Linux - NGC PyTorch container: image: nvcr.io/nvidia/pytorch:25.05-py3 - environment: nemo-ci strategy: fail-fast: false matrix: - python-version: ['3.12'] + python-version: ["3.12"] steps: - name: Checkout repository uses: actions/checkout@v4 @@ -89,11 +88,10 @@ jobs: name: UV - Python${{ matrix.python-version }} - AMD64/Linux - NGC PyTorch container: image: nvcr.io/nvidia/pytorch:25.05-py3 - environment: nemo-ci strategy: fail-fast: false matrix: - python-version: ['3.12'] + python-version: ["3.12"] steps: - name: Checkout repository uses: actions/checkout@v4 diff --git a/.github/workflows/oncall-rotation.yml b/.github/workflows/oncall-rotation.yml index 71ae094e6c8..a621be7f652 100644 --- a/.github/workflows/oncall-rotation.yml +++ b/.github/workflows/oncall-rotation.yml @@ -17,7 +17,7 @@ name: Oncall Rotation on: schedule: # Runs at 09:00 UTC every Wednesday - - cron: '0 9 * * 3' + - cron: "0 9 * * 3" workflow_dispatch: permissions: @@ -25,7 +25,6 @@ permissions: jobs: rotate-schedule: - environment: main runs-on: ubuntu-latest steps: - name: Checkout code @@ -36,7 +35,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: '3.10' + python-version: "3.10" - name: Rotate Schedule env: @@ -59,4 +58,3 @@ jobs: git commit -m "chore: rotate oncall schedule" || echo "No changes to commit" git pull --rebase git push origin HEAD:main - diff --git a/.github/workflows/release-docs.yml b/.github/workflows/release-docs.yml index d15ea74f052..6d619a8a1bc 100644 --- a/.github/workflows/release-docs.yml +++ b/.github/workflows/release-docs.yml @@ -20,23 +20,62 @@ on: required: true type: boolean default: true - version-number: - description: Version number to release this as (use `latest` for main branch) - required: true + publish-as-latest: + description: Publish as Latest stable version. + required: false + type: boolean + default: true + docs-version-override: + description: Docs version if commit is not tagged + required: false type: string + default: "" + update-version-picker: + description: Update version picker. + required: false + type: boolean + default: true notify-emails: description: Email addresses to send the notification to. Format as "me@me.com,you@you.com". + required: false + type: string + workflow_call: + inputs: + dry-run: + description: Whether to run the workflow in dry-run mode required: true + type: boolean + default: true + publish-as-latest: + description: Publish as Latest stable version. + required: false + type: boolean + default: true + docs-version-override: + description: Docs version if commit is not tagged + required: false + type: string + default: "" + update-version-picker: + description: Update version picker. + required: false + type: boolean + default: true + notify-emails: + description: Email addresses to send the notification to. Format as "me@me.com,you@you.com". + required: false type: string - aws-region: - description: AWS region + build-docs-ref: + description: Reference to build the docs from required: false type: string - default: us-east-1 + default: ${{ github.sha }} jobs: build-docs: uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_docs.yml@v0.67.0 + with: + ref: ${{ inputs.build-docs-ref }} publish-docs: runs-on: ubuntu-latest @@ -45,7 +84,7 @@ jobs: - uses: actions/checkout@v6 with: repository: NVIDIA-NeMo/FW-CI-templates - ref: v0.67.2 + ref: v0.74.0 path: FW-CI-templates - uses: ./FW-CI-templates/.github/actions/publish-docs @@ -59,10 +98,12 @@ jobs: artifacts-name: docs-html artifacts-path: _build/html emails-csv: ${{ inputs.notify-emails && format('{0},{1}', vars.docs_release_emails, inputs.notify-emails) || vars.docs_release_emails }} - overwrite-latest-on-tag: false + overwrite-latest-on-tag: ${{ inputs.publish-as-latest }} + docs-version-override: ${{ inputs.docs-version-override }} + update-version-picker: ${{ inputs.update-version-picker }} run-on-version-tag-only: ${{ github.ref_name != 'main' }} request-name: megatron-core-publish-docs-${{ github.run_id }} - aws-region: ${{ inputs.aws-region }} + aws-region: ${{ vars.DOCS_AWS_REGION }} aws-role-to-assume: ${{ secrets.AWS_ASSUME_ROLE_ARN }} aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} diff --git a/.github/workflows/release-freeze.yml b/.github/workflows/release-freeze.yml index 82f26168bd6..dc4bad0a9a7 100644 --- a/.github/workflows/release-freeze.yml +++ b/.github/workflows/release-freeze.yml @@ -42,5 +42,5 @@ jobs: freeze-commit: ${{ inputs.freeze-commit }} dry-run: ${{ inputs.dry-run }} secrets: - SLACK_WEBHOOK: ${{ secrets.SLACK_RELEASE_ENDPOINT }} - SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }} + SLACK_WEBHOOK: ${{ secrets.SLACK_MAIN_CHANNEL_WEBHOOK }} + SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_TEAM_GROUP_ID }} diff --git a/.github/workflows/release-nightly-docs.yml b/.github/workflows/release-nightly-docs.yml new file mode 100644 index 00000000000..89ceb1fbcd8 --- /dev/null +++ b/.github/workflows/release-nightly-docs.yml @@ -0,0 +1,29 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Release Nightly Docs + +on: + schedule: + - cron: "0 10 * * *" + +jobs: + call-release-docs: + uses: ./.github/workflows/release-docs.yml + with: + dry-run: false + publish-as-latest: false + docs-version-override: "nightly" + update-version-picker: false + secrets: inherit diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index aa04408689b..647e6af2379 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -name: 'Release Megatron-Core' +name: "Release Megatron-Core" on: workflow_dispatch: @@ -30,6 +30,16 @@ on: required: true default: true type: boolean + generate-changelog: + description: Generate changelog + required: false + default: true + type: boolean + publish-docs: + description: Publish docs + required: false + default: true + type: boolean version-bump-branch: description: Branch for version bump required: true @@ -47,9 +57,9 @@ jobs: dry-run: ${{ inputs.dry-run || false }} version-bump-branch: ${{ inputs.version-bump-branch || github.ref_name }} create-gh-release: ${{ inputs.create-gh-release || true }} + gh-release-use-changelog-builder: ${{ inputs.generate-changelog }} + publish-docs: ${{ inputs.publish-docs }} secrets: - TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }} - TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }} - SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }} - SLACK_WEBHOOK: ${{ secrets.SLACK_RELEASE_ENDPOINT }} + TWINE_PASSWORD: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && secrets.SVC_PYPI_TOKEN || secrets.SVC_PYPI_TEST_TOKEN }} + SLACK_WEBHOOK: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && secrets.SLACK_MAIN_CHANNEL_WEBHOOK || secrets.SLACK_CI_CHANNEL_WEBHOOK }} PAT: ${{ secrets.PAT }} diff --git a/.github/workflows/sync-team-usergroups.yml b/.github/workflows/sync-team-usergroups.yml index 1c6cecaeb7a..fb48a6ca5d4 100644 --- a/.github/workflows/sync-team-usergroups.yml +++ b/.github/workflows/sync-team-usergroups.yml @@ -16,10 +16,11 @@ name: Sync GitHub Teams to Slack User Groups on: workflow_dispatch: + schedule: + - cron: "0 0 * * *" jobs: sync-usergroups: - environment: main runs-on: ubuntu-latest steps: - name: Checkout code @@ -28,7 +29,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: '3.10' + python-version: "3.10" - name: Sync Teams to User Groups env: diff --git a/.github/workflows/trigger-mbridge-tests.yml b/.github/workflows/trigger-mbridge-tests.yml index b1a3aa0089d..023851e966a 100644 --- a/.github/workflows/trigger-mbridge-tests.yml +++ b/.github/workflows/trigger-mbridge-tests.yml @@ -2,182 +2,41 @@ # SPDX-License-Identifier: Apache-2.0 name: Trigger MBridge Tests -# Remote testing of MBridge from MCore -# Triggers MBridge CI tests with current MCore commit to verify backward compatibility - on: - # Manual trigger only workflow_dispatch: inputs: mbridge_ref: - description: 'MBridge branch/ref to trigger' + description: "MBridge branch/ref to trigger" required: false type: string - default: 'main' - run_cicd_main: - description: 'Run cicd-main.yml (full CI/CD)' - required: false - type: boolean - default: true - run_install_test: - description: 'Run install-test.yml (quick install check)' - required: false - type: boolean - default: true + default: "main" test_suite: - description: 'Test suite to run (for cicd-main)' + description: "Test suite to run" required: false type: choice options: - - 'all' - - 'unit-only' - - 'functional-only' - default: 'all' + - "all" + - "unit-only" + - "functional-only" + default: "all" jobs: - # First job: Get MCore commit info (shared by all matrix jobs) - get-mcore-info: + trigger-mbridge-tests: runs-on: ubuntu-latest - outputs: - sha: ${{ steps.mcore_info.outputs.sha }} - short_sha: ${{ steps.mcore_info.outputs.short_sha }} - branch: ${{ steps.mcore_info.outputs.branch }} - repo_url: ${{ steps.mcore_info.outputs.repo_url }} steps: - - name: Checkout MCore - uses: actions/checkout@v4 + - name: Trigger MBridge tests + uses: convictional/trigger-workflow-and-wait@v1.6.5 with: - fetch-depth: 0 - - - name: Get MCore commit info - id: mcore_info - run: | - echo "sha=$(git rev-parse HEAD)" >> $GITHUB_OUTPUT - echo "short_sha=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT - echo "branch=${GITHUB_REF#refs/heads/}" >> $GITHUB_OUTPUT - - # Get repo URL from origin remote, fallback to constructing from github context - REPO_URL=$(git remote get-url origin 2>/dev/null || echo "${{ github.server_url }}/${{ github.repository }}.git") - echo "repo_url=${REPO_URL}" >> $GITHUB_OUTPUT - - echo "📦 MCore commit: $(git rev-parse --short HEAD)" - echo "🌿 Branch: ${GITHUB_REF#refs/heads/}" - echo "📍 Repo: ${REPO_URL}" - - # Matrix job: Trigger and monitor MBridge workflows in parallel - trigger-and-monitor: - needs: [get-mcore-info] - runs-on: ubuntu-latest - continue-on-error: true # Don't fail workflow if monitoring times out - strategy: - fail-fast: false # Continue other matrix jobs even if one fails - matrix: - include: - - workflow: install-test.yml - name: Install Test - - workflow: cicd-main.yml - name: CI/CD Main - - name: ${{ matrix.name }} - - steps: - - name: Check if workflow should run - id: should_run - run: | - if [[ "${{ matrix.workflow }}" == "install-test.yml" && "${{ inputs.run_install_test }}" == "true" ]]; then - echo "run=true" >> $GITHUB_OUTPUT - elif [[ "${{ matrix.workflow }}" == "cicd-main.yml" && "${{ inputs.run_cicd_main }}" == "true" ]]; then - echo "run=true" >> $GITHUB_OUTPUT - else - echo "run=false" >> $GITHUB_OUTPUT - echo "⏭️ Skipping ${{ matrix.workflow }} (not enabled)" - fi - - - name: Trigger ${{ matrix.workflow }} - if: steps.should_run.outputs.run == 'true' - id: trigger - env: - GH_TOKEN: ${{ secrets.PAT }} - run: | - echo "🚀 Triggering ${{ matrix.workflow }} | MCore: ${{ needs.get-mcore-info.outputs.short_sha }} | MBridge: ${{ inputs.mbridge_ref }}" - - gh workflow run ${{ matrix.workflow }} \ - --repo NVIDIA-NeMo/Megatron-Bridge --ref ${{ inputs.mbridge_ref }} \ - --field mcore_commit=${{ needs.get-mcore-info.outputs.sha }} \ - --field mcore_branch=${{ needs.get-mcore-info.outputs.branch }} \ - --field mcore_repo=${{ needs.get-mcore-info.outputs.repo_url }} \ - --field test_suite=${{ inputs.test_suite }} \ - --field triggered_by=mcore-ci - - - name: Get run ID - if: steps.should_run.outputs.run == 'true' - id: get_run_id - env: - GH_TOKEN: ${{ secrets.PAT }} - run: | - sleep 10 # Wait for run to appear - RUN_ID=$(gh run list \ - --repo NVIDIA-NeMo/Megatron-Bridge \ - --workflow=${{ matrix.workflow }} \ - --limit 5 \ - --json databaseId,createdAt \ - --jq "sort_by(.createdAt) | reverse | .[0] | .databaseId") - - echo "run_id=${RUN_ID}" >> $GITHUB_OUTPUT - echo "📋 Run ID: ${RUN_ID}" - - cat >> $GITHUB_STEP_SUMMARY << EOF - ## 🔄 ${{ matrix.name }} Triggered - - **MCore:** \`${{ needs.get-mcore-info.outputs.short_sha }}\` | **MBridge:** \`${{ inputs.mbridge_ref }}\` | **Suite:** \`${{ inputs.test_suite }}\` - - - 🔄 [${{ matrix.workflow }}](https://github.com/NVIDIA-NeMo/Megatron-Bridge/actions/runs/${RUN_ID}) - Running... - - ⏳ Monitoring every 5 minutes until completion - - > **Note:** Tests run without approval when triggered from MCore - EOF - - - name: Monitor workflow - if: steps.should_run.outputs.run == 'true' - id: monitor - continue-on-error: true - env: - GH_TOKEN: ${{ secrets.PAT }} - run: | - RUN_ID="${{ steps.get_run_id.outputs.run_id }}" - echo "📊 Monitoring ${{ matrix.workflow }} (Run ID: ${RUN_ID})" - - gh run watch ${RUN_ID} --repo NVIDIA-NeMo/Megatron-Bridge --exit-status - - CONCLUSION=$(gh run view ${RUN_ID} --repo NVIDIA-NeMo/Megatron-Bridge --json conclusion --jq -r .conclusion) - echo "workflow_status=${CONCLUSION}" >> $GITHUB_ENV - echo "✅ Completed: ${CONCLUSION}" - - - name: Report results - if: always() && steps.should_run.outputs.run == 'true' - run: | - CONCLUSION="${{ env.workflow_status || 'unknown' }}" - RUN_ID="${{ steps.get_run_id.outputs.run_id }}" - - case "$CONCLUSION" in - "success") ICON="✅"; MSG="passed" ;; - "failure") ICON="❌"; MSG="failed"; EXIT_CODE=1 ;; - "cancelled") ICON="🚫"; MSG="cancelled"; EXIT_CODE=0 ;; - *) ICON="⏳"; MSG="still running or timed out"; EXIT_CODE=0 ;; - esac - - cat >> $GITHUB_STEP_SUMMARY << EOF - ## 📊 ${{ matrix.name }} Results - - ### ${ICON} ${{ matrix.workflow }} - **Status:** \`${CONCLUSION}\` - - [View full results →](https://github.com/NVIDIA-NeMo/Megatron-Bridge/actions/runs/${RUN_ID}) - - --- - *Triggered from MCore \`${{ needs.get-mcore-info.outputs.short_sha }}\`* - EOF - - echo "${ICON} ${{ matrix.name }} ${MSG}" - exit ${EXIT_CODE:-0} - + owner: NVIDIA-NeMo + repo: Megatron-Bridge + workflow_file_name: cicd-main.yml + github_token: ${{ secrets.PAT }} + ref: ${{ inputs.mbridge_ref }} + wait_interval: 60 + propagate_failure: true + client_payload: | + { + "mcore_ref": "${{ github.sha }}", + "test_suite": "${{ inputs.test_suite }}", + "triggered_by": "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" + } diff --git a/.gitlab/stages/00.pre.yml b/.gitlab/stages/00.pre.yml index ff9e4e5178b..e00ce8afc36 100644 --- a/.gitlab/stages/00.pre.yml +++ b/.gitlab/stages/00.pre.yml @@ -39,6 +39,9 @@ pre:create_ci_branches: - branch: ci-approve-main - branch: ci-approve-dev - branch: ci-sync-branches + - branch: ci-testing-1 + - branch: ci-testing-2 + - branch: ci-testing-3 tags: - arch/amd64 - env/prod @@ -49,7 +52,7 @@ pre:create_ci_branches: stage: .pre image: python:3.10 variables: - GIT_STRATEGY: 'clone' + GIT_STRATEGY: "clone" script: - git remote set-url origin "https://gitlab-ci-token:${PROJECT_ACCESS_TOKEN_MCORE}@${GITLAB_ENDPOINT}/adlr/megatron-lm.git" - git switch --force-create $branch @@ -80,7 +83,7 @@ pre:create_ci_branches_dev: stage: .pre image: python:3.10 variables: - GIT_STRATEGY: 'clone' + GIT_STRATEGY: "clone" script: - git remote set-url origin "https://gitlab-ci-token:${PROJECT_ACCESS_TOKEN_MCORE}@${GITLAB_ENDPOINT}/adlr/megatron-lm.git" - git switch --force-create $branch @@ -137,7 +140,7 @@ pre:maybe_cherry_pick_to_main: stage: .pre image: nentangso/alpine-git-curl-jq variables: - GIT_STRATEGY: 'clone' + GIT_STRATEGY: "clone" script: - | set -x @@ -202,7 +205,7 @@ pre:maybe_cherry_pick_commit: stage: .pre image: nentangso/alpine-git-curl-jq variables: - GIT_STRATEGY: 'clone' + GIT_STRATEGY: "clone" script: - set -x - set +e diff --git a/.gitlab/stages/01.build.yml b/.gitlab/stages/01.build.yml index 20252e7d045..61521295a93 100644 --- a/.gitlab/stages/01.build.yml +++ b/.gitlab/stages/01.build.yml @@ -121,6 +121,7 @@ test:build_image: KUBERNETES_SERVICE_MEMORY_LIMIT: 90Gi SHARED_PATH: /builds/$CI_PROJECT_PATH/shared script: + - apk add skopeo - | set -x @@ -132,6 +133,11 @@ test:build_image: ${IMAGE}:${CI_PIPELINE_ID}-arm64 docker manifest push ${IMAGE}:${CI_PIPELINE_ID} + + if [[ "$CI_COMMIT_BRANCH" == "ci-rebuild-mcore-nemo-image" || "$CI_COMMIT_BRANCH" == "main" || "$CI_COMMIT_BRANCH" == "dev" ]]; then + skopeo copy --all docker://${IMAGE}:${CI_PIPELINE_ID} docker://${IMAGE}:${CI_COMMIT_BRANCH} + fi + - echo "MCORE_MR_COMMIT=$CI_COMMIT_SHA" | tee -a build.env - echo "MCORE_BACKWARDS_COMMIT=$MCORE_BACKWARDS_COMMIT" | tee -a build.env - cat build.env diff --git a/.gitlab/stages/02.test.yml b/.gitlab/stages/02.test.yml index 33dd8d7a5fb..a324ce037fb 100644 --- a/.gitlab/stages/02.test.yml +++ b/.gitlab/stages/02.test.yml @@ -211,31 +211,6 @@ test:unit_tests_notify: when: always - when: never -test:linting_docs_build: - extends: [.test_rules] - image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID} - tags: - - arch/amd64 - - env/prod - - origin/jet-fleet - - owner/jet-core - - purpose/utility - - team/megatron - needs: [test:build_image] - script: - - cd .. - - rm -rf documentation && git clone https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/nemo-megatron-core-tme/documentation.git - - mv megatron-lm/ documentation/ - - cd documentation/ - - ./repo docs - rules: - - if: $PUBLISH == "yes" - when: never - - if: $BUILD == "no" - when: never - - when: on_success - allow_failure: true - # Override from template secret_detection: rules: diff --git a/.gitlab/stages/03.integration-tests.yml b/.gitlab/stages/03.integration-tests.yml index d28ecd8e137..70fa345e513 100644 --- a/.gitlab/stages/03.integration-tests.yml +++ b/.gitlab/stages/03.integration-tests.yml @@ -141,18 +141,21 @@ integration:configure: integration:run_lts_dgx_a100: extends: [.integration_run] + allow_failure: true variables: ENVIRONMENT: lts CLUSTER: A100 integration:run_lts_dgx_h100: extends: [.integration_run] + allow_failure: true variables: ENVIRONMENT: lts CLUSTER: H100 integration:run_lts_dgx_gb200: extends: [.integration_run] + allow_failure: true variables: ENVIRONMENT: lts CLUSTER: GB200 diff --git a/.gitlab/stages/04.functional-tests.yml b/.gitlab/stages/04.functional-tests.yml index d32ff86a344..002c96e7c0f 100644 --- a/.gitlab/stages/04.functional-tests.yml +++ b/.gitlab/stages/04.functional-tests.yml @@ -168,18 +168,21 @@ functional:configure: functional:run_lts_dgx_a100: extends: [.functional_run] + allow_failure: true variables: ENVIRONMENT: lts CLUSTER: A100 functional:run_lts_dgx_h100: extends: [.functional_run] + allow_failure: true variables: ENVIRONMENT: lts CLUSTER: H100 functional:run_lts_dgx_gb200: extends: [.functional_run] + allow_failure: true variables: ENVIRONMENT: lts CLUSTER: GB200 @@ -205,15 +208,15 @@ functional:run_dev_dgx_gb200: functional:run_nemo: extends: [.functional_tests_rules] trigger: - project: 'dl/joc/nemo-ci' + project: "dl/joc/nemo-ci" branch: main-mirror strategy: depend inherit: variables: true variables: - MCORE_MR_COMMIT: $CI_COMMIT_SHA - TEST_NEMO2_MODULE: 'True' - ALLOW_FAILURE_DEPENDENCY: 'True' + MCORE_COMMIT: $CI_COMMIT_SHA + TEST_NEMO2_MODULE: "True" + ALLOW_FAILURE_DEPENDENCY: "True" TESTS_TO_RUN_ON_THIS_COMMIT: nightly rules: - if: $FUNCTIONAL_TEST == "yes" @@ -229,6 +232,8 @@ functional:x_notify: - functional:run_dev_dgx_a100 - functional:run_lts_dgx_h100 - functional:run_dev_dgx_h100 + - functional:run_lts_dgx_gb200 + - functional:run_dev_dgx_gb200 tags: - arch/amd64 - env/prod diff --git a/CHANGELOG.md b/CHANGELOG.md deleted file mode 100644 index babdc18b8a4..00000000000 --- a/CHANGELOG.md +++ /dev/null @@ -1,368 +0,0 @@ -# Changelog - -## NVIDIA Megatron Core 0.15.0 - -* Features - * Performance - * Fused QKV preprocessing with precomputed RoPE caches (3x preprocessing speedup, 10-14% E2E) ([MR \!3912](https://github.com/NVIDIA/Megatron-LM/commit/f0d9fa97fead9825ae3eada36ee2df568bfa415b)) - * Use new TE interface for user buffers ([MR \!3886](https://github.com/NVIDIA/Megatron-LM/commit/d47b83807142b6490c7a000e63d25a479b106fd9)) - * Add CPU activation offloading via TE ([MR \!4286](https://github.com/NVIDIA/Megatron-LM/commit/310671436c36e6bd198e92c4f30bc84469cc31d8)) - * Add configurable double buffering ([MR \!4026](https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/4026)) - * Add Muon optimizer and distributed optimizer support ([MR \!4106](https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/4106)) - * Add setting to support Adam or AdamW optimizer ([MR \!3866](https://github.com/NVIDIA/Megatron-LM/commit/03fd0b41b3840c6f19558161d98373a9242402e5)) - * MoE - * Add DTensor support for EP and DSv3 modules ([MR \!3955](https://github.com/NVIDIA/Megatron-LM/commit/268fda08592528b7bc1a21aadaed259980ca8efb)) - * Add HybridEP backend to Flex Dispatcher ([MR \!4237](https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/4237)) - * Support FP8 recomputation for MoE components ([MR \!4030](https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/4030)) - * Implement NVFP4 Zero Padding for MoE ([MR \!4225](https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/4225)) - * Compute shared experts before router ([MR \!4068](https://github.com/NVIDIA/Megatron-LM/commit/e8024d716f3036ebcef8c5254c7830ad09aaf41b)) - * Enable bias in expert MLP ([MR \!3858](https://github.com/NVIDIA/Megatron-LM/commit/a329dd6da586261a45a8f7d04c1e659ffedd80ae)) - * Model support - * Add YaRN support for GPT-OSS ([MR \!4044](https://github.com/NVIDIA/Megatron-LM/commit/2c1b77a9984bfa978e7cf1f58522e5f8e045d017)) - * Add support for Qwen3-Next arguments ([MR \!4070](https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/4070)) - * Add FP8 init for MTP ([MR \!3958](https://github.com/NVIDIA/Megatron-LM/commit/d6c6e54ec5eb43d4e196c7ae84e0e88f28613e6b)) - * Add fp8\_dpa option for FP8 scaling ([MR \!4053](https://github.com/NVIDIA/Megatron-LM/commit/61047e60e617e71ebe120ec293b62df6b0efc84f)) - * Add RADIO-g support to converter and tester ([MR \!4371](https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/4371)) - * Add audio semantic reasoning data for voice chat and speech instructions ([MR \!4397](https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/4397)) - * FSDP - * Enable joint training of parallel modules ([MR \!3850](https://github.com/NVIDIA/Megatron-LM/commit/53008b844f98886a2144c216ecd25952cb2dda58)) - * Add support for multimodule communication ([MR \!4235](https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/4235)) - * Inference - * Add CUDA Graph runner lookup table cache (up to 2x E2E speedup) ([MR \!4082](https://github.com/NVIDIA/Megatron-LM/commit/ab43252fdbedcc3662014ae0e110bd3278d844f4)) - * Add MoE dropping and padding router for CUDA Graph \+ decode ([MR \!3816](https://github.com/NVIDIA/Megatron-LM/commit/56818f9e5090ff9eb0f13f10bfe408aae4031c5c)) - * Dynamic audio shapes with variable sequence lengths (2.5x throughput improvement) ([MR \!4274](https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/4274)) - * Integrate unified memory for dynamic inference context ([MR \!3985](https://github.com/NVIDIA/Megatron-LM/commit/ef4ae4528a0924159069b9f3a2719616156bafa2)) - * Post-training - * Add GPT-OSS ModelOpt support with quantization, import/export ([MR \!4169](https://github.com/NVIDIA/Megatron-LM/commit/a2d8c806b35bc708b13e6c069e19e5dfb49b8481)) - * Enable KD support with hybrid training loop ([MR \!4021](https://github.com/NVIDIA/Megatron-LM/commit/48d7275062a8307f82bd0fa6c1504032c7f3af96)) - * Add ModelOpt pruning example ([MR \!4022](https://github.com/NVIDIA/Megatron-LM/commit/5a58976ebe007064c2ff5e76e815aa5fcf1a8787)) - * RL - * Add importance sampling and partial rollouts to Megatron RL ([MR \!4000](https://github.com/NVIDIA/Megatron-LM/commit/8399280ed3b72a183f44820896a67392c0a47e3e)) - * Add sequence packing for RL ([MR \!4191](https://github.com/NVIDIA/Megatron-LM/commit/ee8e9307f3ad655e6a46f98a483d8192995b02c2)) - * Ease of use - * Handle CUDA absence during import ([MR \!4120](https://github.com/NVIDIA/Megatron-LM/commit/ae44e49271dc45b51a7400ecf6debc598ba90b54)) - * Add granary dataloader functionality ([MR \!4291](https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/4291)) - * Enable SWA mixing with attention ([MR \!3855](https://github.com/NVIDIA/Megatron-LM/commit/e5bc9249d7ad34355f5db4c8ff7d7a9080f94dc2)) -* Bug fixes - * Fix convergence bug in MXFP8 parameter gradient buffer reuse ([MR \!3999](https://github.com/NVIDIA/Megatron-LM/commit/c2c36f77cf7a0476daee5bb2dec604c2764de320)) - * Fix loss mask cloning to prevent incorrect updates ([MR \!4164](https://github.com/NVIDIA/Megatron-LM/commit/c94d58f3260aa568588265e07b3c06bb58cbde41)) - * Fix metadata loss in checkpoints ([MR \!4182](https://github.com/NVIDIA/Megatron-LM/commit/d8c6aa4c0b5d4c15ec1196802bce292d4580ed4a)) - * Fix FSDP grad accum fusion support ([MR \!4018](https://github.com/NVIDIA/Megatron-LM/commit/9f72f4775509668173c75eaab5d58a49f4473748)) - * Fix non-TE optimizer checkpoint issue ([MR \!3931](https://github.com/NVIDIA/Megatron-LM/commit/2ebb6ee95af8b547e3c0ac394d494cb189b890bc)) - * Fix BERT virtual pipeline parallelism ([MR \!3993](https://github.com/NVIDIA/Megatron-LM/commit/18420b63408101fe5a49d125fb29625f1ad6ab26)) - * Fix gc.freeze() slowdown by adding gc.collect() on last layer ([MR \!4003](https://github.com/NVIDIA/Megatron-LM/commit/a3f9e566c9595753553a73d403b2a481ad283fc0)) - * Fix full iteration CUDA graph non-tensor handling ([MR \!4019](https://github.com/NVIDIA/Megatron-LM/commit/8479eb35fbca9631acb846c3ad5d868e02214227)) - * Fix model\_auto\_sync mis-set and add gradient assertion ([MR \!4062](https://github.com/NVIDIA/Megatron-LM/commit/03045f2d880813695f75707e3262a2bfb4206dfe)) - * Fix HF import dtype and checkpoint loading issues ([MR \!4095](https://github.com/NVIDIA/Megatron-LM/commit/435e7e0620ff870d99debd73b3c9113226622dde)) - * Fix missing initialization in ProcessGroupCollection ([MR \!4159](https://github.com/NVIDIA/Megatron-LM/commit/5f2becf232a85df8687dc539e604e00a6a875da1)) - * Fix sink attention TP ([MR \!4173](https://github.com/NVIDIA/Megatron-LM/commit/3b1b9b267193d72d4f8dc710561c2368de8c114c)) - * Fix num\_microbatches calculation ([MR \!4199](https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/4199)) - * Fix 1f1b overlap unit tests for MTP standalone ([MR \!4210](https://github.com/NVIDIA/Megatron-LM/commit/44bc753d69cf509c158bb261434498b141fe5130)) - * Fix stale state dict handling ([MR \!4226](https://github.com/NVIDIA/Megatron-LM/commit/0ba847081113a92ce01084f33cd4a0c1f31b327b)) - * Fix dataset divergence with tokenizer PAD handling ([MR \!4231](https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/4231)) - * Fix parameter initialization ([MR \!4296](https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/4296)) - * Ensure tensor-parallel attributes set regardless of initialization flag ([MR \!4312](https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/4312)) -* Known issues - -## NVIDIA Megatron Core 0.14.0 - -* Features - * Inference - * Add async support for DynamicInferenceEngine ([MR \!3187](https://github.com/NVIDIA/Megatron-LM/commit/05079d55a5bfcc7a43f4619e36a40a9e8db3f882)) - * Pad input tensors and enable FP8 weights for FP8 inference ([MR \!3341](https://github.com/NVIDIA/Megatron-LM/commit/6a6cd478839d90cf09a837adf8c79cbc844bc920)) - * Force inference to always gather logits with tensor parallelism ([MR \!3442](https://github.com/NVIDIA/Megatron-LM/commit/7c9cdcb794089968278c7272e0261a68edf5d369)) - * Multi batch size CUDA Graphs for Dynamic Inference ([MR \!3402](https://github.com/NVIDIA/Megatron-LM/commit/30aabe5e3133c6d70aa55aaabad4ea8cb04ce63c)) - * Post-training - * ModelOpt updates ([MR \!3268](https://github.com/NVIDIA/Megatron-LM/commit/550ed5243c3a18e39430c15e8918ee63e41d7eaf)) - * Add speculative decoding AR validation feature - * Add DeepSeek and Qwen model configs - * Performance - * ModelCommProcessGroup integration ([MR \!3391](https://github.com/NVIDIA/Megatron-LM/commit/26adc2dfde53fbc2b063e2fdd1d9ed26578811a6)) - * Add HyperCommGrid: N-Dimensional Communication Grid for Model Parallelism ([MR \!3398](https://github.com/NVIDIA/Megatron-LM/commit/45400df7da7fa23e3aff86804e5ac254d9a8d3c0)) - * Flexible creation and management of communication groups - * Add support for Spike No More embedding initializations and weight decay skipping ([MR \!3500](https://github.com/NVIDIA/Megatron-LM/commit/ee74aa66a06b24e511270f285db475941ef63bfd)) - * MoE - * We're actively optimizing large-scale fine-grained MoE performance on Blackwell Platform. - * Features: - * Support Expert Parallel A2A Overlapping ([MR \!3470](https://github.com/NVIDIA/Megatron-LM/commit/0c6c1176fb3e3e00534b3591f1ad023d4ecad6fb); [MR \!3074](https://github.com/NVIDIA/Megatron-LM/commit/4b30ec54aba97e16a083eca33d2df1dd48e1b48f)) - * Support CP and recompute for MTP ([MR \!3330](https://github.com/NVIDIA/Megatron-LM/commit/650ab87d04105869f197f2ddc441e3b18ca93724)) - * Add support for global aux loss ([MR \!3318](https://github.com/NVIDIA/Megatron-LM/commit/e58d9080ea212e005ccba0b6607bfcc86451285d)) - * Memory Optimization - * Support recomputation for FP8 layernorm/moe\_act/shared\_experts ([MR \!3465](https://github.com/NVIDIA/Megatron-LM/commit/6850cc6a739d168f8c84db6cdacf4fe2931c0c49)) - * Support optimizer offloading for DSV3 FP8 training ([MR \!3659](https://github.com/NVIDIA/Megatron-LM/commit/abbde02f54b62a5194ebe951218e98feceba6d42)) - * Performance Optimization - * Add MoE router fusion ([MR \!3809](https://github.com/NVIDIA/Megatron-LM/commit/d93743a9f11d5d17824b8b49868cc90f2904896f)) - * Updates for MoE cudagraph ([MR \!3631](https://github.com/NVIDIA/Megatron-LM/commit/95452706d7aa16dc174813e12639a8c8356fbe87)) - * Bug fixes: - * Fix router input jitter dtype ([MR \!3774](https://github.com/NVIDIA/Megatron-LM/commit/20b395424d2e2bbfaab57b2f954294eb57c90c82)) - * Model support - * Add MiMo video VLM train example ([MR \!3543](https://github.com/NVIDIA/Megatron-LM/commit/786f5629d3462aff2f8855f51db70e882c475116)) - * Add AVLM for MIMO ([MR \!3624](https://github.com/NVIDIA/Megatron-LM/commit/db41707430bff743f986b5779712c74242b99caa)) - * Ease of use - * Add uv support for source installs ([MR \!3615](https://github.com/NVIDIA/Megatron-LM/commit/164204cd7216e642bdef7299c569d95f02f9a79e)) - * Automated weekly prereleases ([MR \!3574](https://github.com/NVIDIA/Megatron-LM/commit/7e59266c70ef34a246438640af690b55c7ecac28)) -* Bug fixes - * Use mscale\_all\_dim for softmax\_factor ([MR \!2800](https://github.com/NVIDIA/Megatron-LM/commit/e96a358f60c82b8ac8d965d91c3cc4ad0230a4e0)) - * Fix FP8 param blockwise scaling unit test ([MR \!3480](https://github.com/NVIDIA/Megatron-LM/commit/57082f946a04c3390fcfc43634dc546ec3ded033)) - * Fix unit test blockwise scaling ([MR \!3491](https://github.com/NVIDIA/Megatron-LM/commit/6d95fe63658f967e56a3fda88a9c30a424fcb520)) - * Optimize prefill for token-less requests ([MR \!3499](https://github.com/NVIDIA/Megatron-LM/commit/daaa650a9ac4291d4027ca2fdeb4298ce024efd2)) - * Add default values for Fp8Padding and Fp8Unpadding ([MR \!3501](https://github.com/NVIDIA/Megatron-LM/commit/42b2b1d10a9cb699b7e5aa40f6bfba9c2a1348aa)) - * Fix CUDA graph logic for flexible pp layout ([MR \!3505](https://github.com/NVIDIA/Megatron-LM/commit/020d85e50ddf0f0282802002acb3662129a519c5)) - * Load FP8 models with strict=False ([MR \!3508](https://github.com/NVIDIA/Megatron-LM/commit/1ab876ddc4c1893c76f26d775226a8d1dcdfb3d2)) - * Skip rope check for torch \< 1.4.0 ([MR \!3528](https://github.com/NVIDIA/Megatron-LM/commit/d8180ef8ed0bb6f305dcdedf1b27d91304f361a3)) - * Disable Apex tests for stability ([MR \!3539](https://github.com/NVIDIA/Megatron-LM/commit/d1256277fe378add0a2cfd7251f5a350b6d126ec)) - * Fix typo in parallel\_state expert parallelism ([MR \!3548](https://github.com/NVIDIA/Megatron-LM/commit/5783ff32af759b8102cf0cb0bb82b30c48b9da26)) - * Guard modelopt on macOS ([MR \!3549](https://github.com/NVIDIA/Megatron-LM/commit/76144fe1106e4fb0e69aa75b7a6ab66e71e8f37f)) - * Retry on CUDA function failure ([MR \!3554](https://github.com/NVIDIA/Megatron-LM/commit/809aab68307a64c1386d68cc78ef70f8f4e12a80)) - * Fix NCCL mem pool creation error ([MR \!3557](https://github.com/NVIDIA/Megatron-LM/commit/b61e21153146a563309b5d44cb5d7f7425806072)) - * Fix get\_rotary\_seq\_len return type ([MR \!3559](https://github.com/NVIDIA/Megatron-LM/commit/1fa6bc83c7aeae95abc8e86ff0aac596985a01c3)) - * Retry on CUDA function failure ([MR \!3560](https://github.com/NVIDIA/Megatron-LM/commit/7da88d74865c3f1a59894173246f26e7b3bf91b9)) - * Fix NCCL allocator attribute error ([MR \!3565](https://github.com/NVIDIA/Megatron-LM/commit/6b656114795d74c3353cb007c59af49b1752f447)) - * Ensure multi-prompt inference works ([MR \!3568](https://github.com/NVIDIA/Megatron-LM/commit/0fae48931000c9c7af06f7dcf037b5b7d96e0cd6)) - * Fix MD5 on FIPS systems ([MR \!3577](https://github.com/NVIDIA/Megatron-LM/commit/83ee8c2848a3b1d42b40086a64da11e19f4b191f)) - * Fixes dynamic context and inference bugs ([MR \!3582](https://github.com/NVIDIA/Megatron-LM/commit/e9c1da60a1ccc85376666d58568ed1d3e5a4f9db)) - * Fix TE version for interleaved fused RoPE ([MR \!3586](https://github.com/NVIDIA/Megatron-LM/commit/b72b6cc161f5273b545bca09677382917cf20492)) - * Fix MTP with MoE and TP logging ([MR \!3594](https://github.com/NVIDIA/Megatron-LM/commit/9af96623b66693e058f6bfce8d0094dc976792d8)) - * Guard TE import fix ([MR \!3596](https://github.com/NVIDIA/Megatron-LM/commit/1bf946b1ec3f11e71459c7c0d06a97edbed96a1a)) - * Add assertion for NCCL UB case ([MR \!3599](https://github.com/NVIDIA/Megatron-LM/commit/e11d28592f19c122859be764b7afe7c208d9acc1)) - * Remove Encoder PP related Functions ([MR \!3604](https://github.com/NVIDIA/Megatron-LM/commit/9e49aa4446a58cc21c4dc0c5d0806551ad075ca7)) - * Fix segfaults in tests ([MR \!3605](https://github.com/NVIDIA/Megatron-LM/commit/f6492fe8164fd5b9ad55007d435ccfc66cb98cc7)) - * Fix TE error in distributed optimizer ([MR \!3625](https://github.com/NVIDIA/Megatron-LM/commit/e6c510ff3c1159f8955589b26f7c395bdf0607d9)) - * Remove redundant barrier in checkpoint flow ([MR \!3626](https://github.com/NVIDIA/Megatron-LM/commit/26869feb6a3ac7f5616cb7253c37a4244d107d70)) - * Support VPP MTP, fix logging ([MR \!3630](https://github.com/NVIDIA/Megatron-LM/commit/c351a473c7eedac2c43eab0815afb9759f4f8187)) - * Retry mechanism for free(): invalid pointer errors ([MR \!3632](https://github.com/NVIDIA/Megatron-LM/commit/ec35b41b2df145a7ccb84afc48d94e0786e094da)) - * Fix test\_replication.py issues ([MR \!3633](https://github.com/NVIDIA/Megatron-LM/commit/f7b50b271b2e0e396069e02551b21aa6fb374b43)) - * Fix typo in parallel\_state ([MR \!3634](https://github.com/NVIDIA/Megatron-LM/commit/3c79a2c330290df58804c33e28e7c197fcc1f0b9)) - * Fix CUDA graph logic determination ([MR \!3635](https://github.com/NVIDIA/Megatron-LM/commit/90efa3ef8a3c4f9e0f1db9f67ab9348bfa501387)) - * Fix TE installation error ([MR \!3636](https://github.com/NVIDIA/Megatron-LM/commit/7e7322c01c9cb8ec254ecd9042700b22b70fe5c8)) - * Ensure correct sharding type in local tests ([MR \!3643](https://github.com/NVIDIA/Megatron-LM/commit/946357f8dd7fdc12424b3a66bc999e6c0a02696c)) - * Fix cudagraphed backward buffer reuse for last layer ([MR \!3645](https://github.com/NVIDIA/Megatron-LM/commit/ee61cf450d24760952e8995aab045ab6d55b986e)) - * Set default for packed\_seq\_params in get\_rotary\_seq\_len ([MR \!3651](https://github.com/NVIDIA/Megatron-LM/commit/510d58c46664f44c556005ac928c5c531e12f761)) - * Fix dynamic example script errors ([MR \!3653](https://github.com/NVIDIA/Megatron-LM/commit/72e290bf1f4bbf0c8047bb10a51da6ea6372e163)) - * Guard TE import fix ([MR \!3666](https://github.com/NVIDIA/Megatron-LM/commit/ac198fc0d60a8c748597e01ca4c6887d3a7bcf3d)) -* Breaking changes: - * `megatron.core.distributed.custom_fsdp` refactored as breaking change to `megatron.core.distributed.fsdp.src.megatron_fsdp` -* Known issues - -## NVIDIA Megatron Core 0.13.0 - -* Support bf16 dtype for optimizer states to use precision-aware optimizer in TransformerEngine -* MoE - * Features: - * Flexible Asymmetric Virtual Pipeline Parallelism with Custom Pipeline Layout (--pipeline-model-parallel-layout) - * Add support to pass custom parallelism groups to MoE modules. - * Add Hybrid Shard Data-Parallel support for MoE models (--num-distributed-optimizer-instances) - * Support EP \+ custom FSDP training for DeepSeek-V3 - * FP8 support for Multi-Token-Prediction - * Memory Optimization - * Fine-grained recomputation to reduce activation memory. (--recompute-modules with \--recompute-granularity selective) - * Memory efficient token permutation by moving the probs multiplication from unpermutation to activation function of GroupedMLP. - * Performance Optimization - * MLA RoPE fusion kernel and YARN embedding cache. - * FP8 padding optimization of MoE models by padding the routing map. - * Bug fixes: - * Fix the aux loss calculation when expert\_bias or group limited routing is used. This leads to load\_balancing\_loss values change compared to the previous version. - * Fix packed sequence support for MLA - * Known Issues: - * MTP is not compatible with flexible pipeline layout, will be fixed at \!3594. - * MTP convergence issue with TP2, will be fixed at \!3594. - -## NVIDIA Megatron Core 0.12.0 - -* Add FP8 recipe selection to arguments (--fp8-recipe, --first-last-layers-bf16, --num-layers-at-start-in-bf16, --num-layers-at-end-in-bf16) -* Context parallel: fix loss scaling when calculate_per_token_loss=True -* Make the number of data parallel communication buckets configurable (--ddp-num-buckets, --ddp-pad-buckets-for-high-nccl-busbw) -* Inference - * Support in-flight batching and chunked KV cache - * Reduce memory usage, - * by not materializing full attention mask - * by only materializing logits for the last token during decode - * by removing an obsolete tensor reference -* Hybrid Model - * Inference - * Add CUDA graph support - * Change tools/run_mamba_text_generation_server.py to use megatron.core.inference - * Fix a shape issue when materializing logits for Mamba model - * Improve initialization of Mamba layers - * Add configuration switches (--mamba-state-dim, --mamba-head-dim, --mamba-num-groups, --is-hybrid-model) - * Make num_floating_point_operations work with hybrid model - * Make hybrid_conversion.py work with mixer that uses TE linear - * Add FP8 support - * Fix Mamba dt_bias tensor parallelism - * Support multimodal tokenizer - * Improve data parallelism scaling -* MoE - * Features: - * DeepEP support, compatible with all the parallelisms and token drop / dropless - * Important precision improvement: Enable FP32/FP64 routing and unpermutation using –moe-router-dtype. FP32 is recommended for all fine-grained MoE training - * CUDA Graph support for MoE - * Multi-Token Prediction (MTP) Support - * Fused indices_to_multihot kernel for DeepEP dispatcher - * Bug fixes: - * Fix Hang Issue with MoE+Dense Hybrid models - * Update theoretical memory and tflops estimation for MoE and MLA - * Fix MoE Aux loss scaling for per token loss - * Fixes for group limited routing and expert bias. We verified these fixes through dsv3 e2e verifications - * Known issues: - * The ckpt trained with Custom FSDP for MoE may not be compatible with 3D parallel training. - -## NVIDIA Megatron Core 0.11.0 - -* Add multi datacenter training support though N/S connection -* MoE - * Features - * Support DeepSeek-V3 fine-tuning - * Aux-loss-free load balancing strategy - * Node-limited routing and Device-limited routing support. - * Tensor Parallelism support for MLA and Sequence Auxiliary Loss - * MTP (with TP and PP support) is coming soon. - * Permutation / Unpermutation fusion kernel from TransformerEngine. - * Uneven virtual pipeline parallel split support in first and last PP stage. - * Bug fixes: - * Fix the grad scale when TP != expert-TP and average_in_collective is enabled in DDP. - * Fix TEGroupedMLP distckpt compatibility issue with FP8 padding/unpadding. - * Known Issues: - * When training the Dense+MoE hybrid model, the process will hang if any PP rank does not have expert params. -* Add MX-FP16 support for optimizer and master weights -* CUDA Graph memory optimizations -* Enable UCC backend for PP communication -* Optimizer CPU offload support for memory savings -* Models - * Initial RADIO/CRADIO implementation - * llama3.2 support -* Hybrid Model - * Support quantization via TensorRT Model Optimizer - -## NVIDIA Megatron Core 0.10.0 - -* Adding MLA to MCore -* Enable FP8 for GroupedMLP -* MoE Parallel Folding -* Enhance MoE Architecture: Support MoE Layer Frequency Patterns and Configurable MoE FFN Hidden Size -* Multimodal: NVLM training and evaluation support in MCore -* Mamba Hybrid - * Increase performance and reduce memory footprint of Triton language/compiler distributed caching - * Add more unit testing and fix bugs - -## NVIDIA Megatron Core 0.9.0 - -* Uneven pipeline parallelism - * Enable pipeline parallelism where first and last ranks have fewer transformer layers than the intermediate ranks -* Per layer CUDAGraph support for GPT training with Transformer Engine modules -* Enable different TP sizes for the vision encoder -* Enable pipeline parallelism for T5 & Llava models -* Support multi-tile multi-image input in Llava models -* MoE - * FP8 support - * Runtime upcycling support - * Dispatcher implementation optimizations - * Shared expert support with overlapping optimizations - * Qwen Model support -* Known Issues - * When using sequence parallel, during the transformer block forward pass, dropout is not using the appropriate rng context. -* NVRx / Fault tolerance - * fault and hang detection in addition to existing straggler detection - * graceful exit and auto restart - -## NVIDIA Megatron Core 0.8.0 - -* Multimodal - * Added initial support for training vision language models using the LLaVA architecture - * Added initial support for inference with multimodal inputs - * End-to-end multimodal example from data collection to training to evaluation is provided in examples/multimodal -* MoE - * Context Parallel support. - * Distributed checkpoint support for grouped GEMM. -* Mamba - -## NVIDIA Megatron Core 0.7.0 - -* MoE - * Token drop support - * Several efficiency optimizations - * Improved model parallelism - * Memory optimizations -* Distributed checkpointing - * Enabled for Retro - * Asynchronous checkpoint saving -* Several minor bug fixes, speed improvements, and memory optimizations - -## NVIDIA Megatron Core 0.6.0 - -* MoE (Mixture of Experts) - * Performance optimization - * Communication optimization for multi GPU and Single GPU - * 23% improvement (323 TFLOPS/GPU) over MCore 0.5.0 on Mixtral with Hopper BF16 - * GroupedMLP enhancement for Hopper - * DP Overlapping. Support overlapping computation with gradient reduction and parameter gathering. - * All-to-All based Token Dispatcher - * Layer-wise logging for load balancing loss. - * Improved expert parallel support including distributed optimizer. -* Distributed optimizer -* RETRO - * Data processing -* BERT - * Distributed checkpointing -* Dist checkpointing - * PyTorch native distributed backend - * Improved saving/loading speed -* TensorRT-LLM Export - * Integration with TensorRT Model Optimizer Post-training quantization (PTQ) - * Text generation driver to perform PTQ in Megatron-LM - * Llama2 and Nemotron3-8b examples to use TensorRT-LLM unified build API to build engine after training. -* Several minor enhancements, bug fixes, and documentation updates - -## NVIDIA Megatron Core 0.5.0 - -### Key Features and Enhancements - -Megatron core documentation is now [live!](https://docs.nvidia.com/megatron-core/developer-guide/latest/user-guide/index.html#quick-start) - -### Model Features - -* MoE (Mixture of Experts) - * Support for Z-loss, Load balancing and Sinkhorn - * Layer and communications refactor - * Richer parallelism mappings and EP can be combined with other model parallel techniques for larger MoE variants, e.g. EP + TP + DP + SP + PP - * Token dropless architecture with Top-K routing - * Performance optimization with with GroupedGEMM when number of local experts is > 1 - * Distributed checkpointing -* Interleaved rotary embedding - -### Datasets - -* Masked WordPiece datasets for BERT and T5 -* Raw and mock datasets - -### Parallelism - -### Performance - -* Activation offloading to CPU -* Rope and Swiglu fusion -* Sliding window attention (via Transformer Engine) - -### General Improvements - -* Timers - -## NVIDIA Megatron Core 0.4.0 - -### Key Features and Enhancements - -#### Models - -* BERT -* RETRO -* T5 - -#### Parallelism - -* Mixture of Experts support for GPT -* Model parallel efficient Distributed Data Parallel (DDP) -* Context Parallel (2D Tensor Parallel) support - -#### Datasets - -* GPT Dataset -* Blended Dataset diff --git a/docs/add_copyright_header.py b/docs/add_copyright_header.py new file mode 100644 index 00000000000..9bc4481c506 --- /dev/null +++ b/docs/add_copyright_header.py @@ -0,0 +1,32 @@ +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +#!/usr/bin/env python3 +"""One-off script to add NVIDIA copyright header to all .md files under docs/.""" + +from pathlib import Path + +HEADER = """ Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved. + NVIDIA CORPORATION and its licensors retain all intellectual property + and proprietary rights in and to this software, related documentation + and any modifications thereto. Any use, reproduction, disclosure or + distribution of this software and related documentation without an express + license agreement from NVIDIA CORPORATION is strictly prohibited. + +""" + +def main(): + docs_dir = Path(__file__).resolve().parent + already_has = "Copyright (c) 2022-2026, NVIDIA CORPORATION" + count = 0 + for path in sorted(docs_dir.rglob("*.md")): + content = path.read_text(encoding="utf-8") + if content.strip().startswith(already_has): + continue + new_content = HEADER + content + path.write_text(new_content, encoding="utf-8") + count += 1 + print(path.relative_to(docs_dir)) + print(f"\nUpdated {count} files.") + +if __name__ == "__main__": + main() diff --git a/docs/advanced/index.md b/docs/advanced/index.md index 573cb0ee81a..98ff0806cff 100644 --- a/docs/advanced/index.md +++ b/docs/advanced/index.md @@ -1,3 +1,12 @@ + + # Discussions In-depth technical discussions and optimization guides: diff --git a/docs/api-backwards-compatibility-check.md b/docs/api-backwards-compatibility-check.md index a417abfc2df..e1b6939b06f 100644 --- a/docs/api-backwards-compatibility-check.md +++ b/docs/api-backwards-compatibility-check.md @@ -1,3 +1,12 @@ + + # API Backward Compatibility Checking ## Overview diff --git a/docs/api-guide/core/datasets.md b/docs/api-guide/core/datasets.md index e97e99ae1db..d80c0183375 100644 --- a/docs/api-guide/core/datasets.md +++ b/docs/api-guide/core/datasets.md @@ -1,3 +1,12 @@ + + # datasets package ```{include} ../../../megatron/core/datasets/readme.md diff --git a/docs/api-guide/core/dist_checkpointing.md b/docs/api-guide/core/dist_checkpointing.md index 959aa4b07e0..ee0e5562ef3 100644 --- a/docs/api-guide/core/dist_checkpointing.md +++ b/docs/api-guide/core/dist_checkpointing.md @@ -1,3 +1,12 @@ + + # dist_checkpointing package A library for saving and loading the distributed checkpoints. @@ -32,19 +41,62 @@ import torch, argparse torch.serialization.add_safe_globals([argparse.Namespace]) ``` -Checkpointing Distributed Optimizer ------------------------------------ +## Checkpointing Distributed Optimizer -Checkpoint Compatibility and Optimizer State Formats -#################################################### +### Checkpoint Compatibility and Optimizer State Formats Beginning with **mcore v0.14**, the ``flattened_range`` attribute was removed from ``dist_checkpointing``. As a result: -- Optimizer states saved with mcore versions < 0.14 are no longer loadable. Loading these legacy optimizer states is not supported because the required sharded metadata is no longer available. -- Model weights from older checkpoints remain fully compatible. No additional work is required—model weights from checkpoints produced by earlier versions are loaded automatically. +- Optimizer states saved with mcore versions <= 0.14 can no longer be loaded directly. Loading these legacy optimizer states is not supported because the required sharded metadata is no longer available. If you need to continue training from older checkpoints, refer to the workaround described below. +- Model weights from older checkpoints remain fully compatible. No extra steps are needed—model weights from checkpoints created by earlier versions load automatically; simply add the ``--no-load-optim`` flag. + +### Workaround: Loading legacy optimizer states with ToT MCore + +**Step 1: Convert the legacy checkpoint using mcore v0.15.0** + +Run a dummy training job with mcore v0.15.0 to re-save the checkpoint with new optimizer states format. + +```bash +MODEL_TRAIN_PARAMS=( + # Define model architecture and training parameters here +) +OLD_CKPT=/workspace/mcore_ckpt_old +CONVERTED_CKPT=/workspace/mcore_ckpt_0.15.0 + +torchrun --nproc_per_node=8 /opt/megatron-lm/pretrain_gpt.py \ + --save-interval 1 \ + --eval-interval 1 \ + --exit-interval 1 \ + --eval-iters 1 \ + --use-distributed-optimizer \ + --save ${CONVERTED_CKPT} \ + --load ${OLD_CKPT} \ + --ckpt-format torch_dist \ + "${MODEL_TRAIN_PARAMS[@]}" +``` + +**Step 2: Load the converted checkpoint with ToT MCore** + +Use the converted checkpoint as the input for continued training with ToT MCore. + +```bash +MODEL_TRAIN_PARAMS=( + # Define model architecture and training parameters here +) +NEW_CKPT=/workspace/mcore_ckpt_new +CONVERTED_CKPT=/workspace/mcore_ckpt_0.15.0 + +torchrun --nproc_per_node=8 /opt/megatron-lm/pretrain_gpt.py \ + --use-distributed-optimizer \ + --save ${NEW_CKPT} \ + --load ${CONVERTED_CKPT} \ + --ckpt-format torch_dist \ + "${MODEL_TRAIN_PARAMS[@]}" +``` + +After this step, training can proceed normally using ToT MCore with fully supported optimizer state loading. -Distributed Optimizer Checkpoint Formats -######################################## +## Distributed Optimizer Checkpoint Formats The refactor of the Distributed Optimizer introduces **two checkpoint formats**: @@ -57,8 +109,7 @@ The refactor of the Distributed Optimizer introduces **two checkpoint formats**: - Slower than dp_reshardable. - Enabled via the ``--dist-ckpt-optim-fully-reshardable`` flag. -Workflow for Changing Model Parallelism -####################################### +### Workflow for Changing Model Parallelism You can combine formats to optimize both flexibility and performance: diff --git a/docs/api-guide/core/dist_checkpointing.strategies.md b/docs/api-guide/core/dist_checkpointing.strategies.md index 7aab8609504..22fe3517a54 100644 --- a/docs/api-guide/core/dist_checkpointing.strategies.md +++ b/docs/api-guide/core/dist_checkpointing.strategies.md @@ -1,3 +1,12 @@ + + # dist_checkpointing.strategies package Package defining different checkpoint formats (backends) and saving/loading algorithms (strategies). diff --git a/docs/api-guide/core/distributed.md b/docs/api-guide/core/distributed.md index 1921c0bdd57..13da4285ec5 100644 --- a/docs/api-guide/core/distributed.md +++ b/docs/api-guide/core/distributed.md @@ -1,3 +1,12 @@ + + # distributed package This package contains various utilities to finalize model weight gradients diff --git a/docs/api-guide/core/fusions.md b/docs/api-guide/core/fusions.md index 396280ad7da..fdd358e813c 100644 --- a/docs/api-guide/core/fusions.md +++ b/docs/api-guide/core/fusions.md @@ -1,3 +1,12 @@ + + # fusions package This package provides modules that provide commonly fused diff --git a/docs/api-guide/core/index.md b/docs/api-guide/core/index.md index 150fd72cb1e..0d39e46e744 100644 --- a/docs/api-guide/core/index.md +++ b/docs/api-guide/core/index.md @@ -1,3 +1,12 @@ + + # Core APIs Low-level API reference for core Megatron components. diff --git a/docs/api-guide/core/pipeline_parallel.md b/docs/api-guide/core/pipeline_parallel.md index 42fac8cc449..35f3c5b5cc2 100644 --- a/docs/api-guide/core/pipeline_parallel.md +++ b/docs/api-guide/core/pipeline_parallel.md @@ -1,3 +1,12 @@ + + # pipeline_parallel package This package contains implementations for two different pipeline parallelism diff --git a/docs/api-guide/core/tensor_parallel.md b/docs/api-guide/core/tensor_parallel.md index 33a9160c82b..2d41c5f4467 100644 --- a/docs/api-guide/core/tensor_parallel.md +++ b/docs/api-guide/core/tensor_parallel.md @@ -1,3 +1,12 @@ + + # tensor_parallel package This package contains an implementation for tensor parallelism in transformer diff --git a/docs/api-guide/core/transformer.md b/docs/api-guide/core/transformer.md index d004381844b..622fb006f99 100644 --- a/docs/api-guide/core/transformer.md +++ b/docs/api-guide/core/transformer.md @@ -1,3 +1,12 @@ + + # transformer package The `transformer` package provides a customizable and configurable diff --git a/docs/api-guide/index.md b/docs/api-guide/index.md index 851114d98e8..7afa2450dd0 100644 --- a/docs/api-guide/index.md +++ b/docs/api-guide/index.md @@ -1,3 +1,12 @@ + + # API Guide API reference documentation for Megatron Core components. diff --git a/docs/api-guide/internal/index.md b/docs/api-guide/internal/index.md index c216a976c77..312081ce70b 100644 --- a/docs/api-guide/internal/index.md +++ b/docs/api-guide/internal/index.md @@ -1,3 +1,12 @@ + + # Internal Utilities Internal utility APIs. diff --git a/docs/api-guide/internal/num_microbatches_calculator.md b/docs/api-guide/internal/num_microbatches_calculator.md index 470c9e49128..0c223588ce5 100644 --- a/docs/api-guide/internal/num_microbatches_calculator.md +++ b/docs/api-guide/internal/num_microbatches_calculator.md @@ -1,3 +1,12 @@ + + # Microbatches Calculator This api is used to calculate the number of microbatches required to fit a given model on a given batch size. diff --git a/docs/api-guide/internal/optimizer_param_scheduler.md b/docs/api-guide/internal/optimizer_param_scheduler.md index 13e1f77ccc0..45e5e4b7da1 100644 --- a/docs/api-guide/internal/optimizer_param_scheduler.md +++ b/docs/api-guide/internal/optimizer_param_scheduler.md @@ -1,3 +1,12 @@ + + # Optimizer Parameters Scheduler This api is used to calculate the learning rate and weight decay for the optimizer. diff --git a/docs/api-guide/models/index.md b/docs/api-guide/models/index.md index c6279d2409a..e5bb531454b 100644 --- a/docs/api-guide/models/index.md +++ b/docs/api-guide/models/index.md @@ -1,3 +1,12 @@ + + # Model APIs API reference for Megatron Core model implementations. diff --git a/docs/api-guide/models/models.bert.md b/docs/api-guide/models/models.bert.md index 3c53027c7c9..1543f4df865 100644 --- a/docs/api-guide/models/models.bert.md +++ b/docs/api-guide/models/models.bert.md @@ -1,3 +1,12 @@ + + # models.bert package Useful package for training bert and bert like encoder only models. It optionally comes with a binary head that can be used for classification tasks . diff --git a/docs/api-guide/models/models.gpt.md b/docs/api-guide/models/models.gpt.md index a7c254d348b..1c3cbb5484c 100644 --- a/docs/api-guide/models/models.gpt.md +++ b/docs/api-guide/models/models.gpt.md @@ -1,3 +1,12 @@ + + # models.gpt package This is the implementation of the popular GPT model. It supports several features like model parallelization (Tensor Parallel, Pipeline Parallel, Data Parallel) , mixture of experts, FP8 , Distributed optimizer etc. We are constantly adding new features. So be on the lookout or raise an issue if you want to have something added. diff --git a/docs/api-guide/models/models.md b/docs/api-guide/models/models.md index 69dfc80211d..a633546f0c9 100644 --- a/docs/api-guide/models/models.md +++ b/docs/api-guide/models/models.md @@ -1,3 +1,12 @@ + + # models package This package contains most of the popular LLMs . Currently we have support for GPT, Bert, and T5 . This is an ever growing list so keep an eye out. diff --git a/docs/api-guide/models/models.t5.md b/docs/api-guide/models/models.t5.md index 90952096b63..4694b80113a 100644 --- a/docs/api-guide/models/models.t5.md +++ b/docs/api-guide/models/models.t5.md @@ -1,2 +1,11 @@ + + # models.t5 package diff --git a/docs/api-guide/router_replay.md b/docs/api-guide/router_replay.md index 300a50db127..4c1dc98b2b6 100644 --- a/docs/api-guide/router_replay.md +++ b/docs/api-guide/router_replay.md @@ -1,3 +1,12 @@ + + # Design Document: MoE Router Replay Feature ### 1. Overview diff --git a/docs/conf.py b/docs/conf.py index a64da441084..84ed89753f5 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025-2026, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -25,9 +25,9 @@ # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information project = "Megatron Core" -copyright = "2025, NVIDIA Corporation" +copyright = "2026, NVIDIA Corporation" author = "NVIDIA Corporation" -release = "latest" +release = "nightly" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration diff --git a/docs/developer/contribute.md b/docs/developer/contribute.md index 859b5562f4b..aeb785f915d 100644 --- a/docs/developer/contribute.md +++ b/docs/developer/contribute.md @@ -1,3 +1,12 @@ + + # Contributing to Megatron-LM This document outlines the processes and policies for issues and pull requests by non-NVIDIA contributors to the Megatron-LM GitHub repository. diff --git a/docs/developer/generate_docs.md b/docs/developer/generate_docs.md index 52fa288122d..d985f542caa 100644 --- a/docs/developer/generate_docs.md +++ b/docs/developer/generate_docs.md @@ -1,3 +1,12 @@ + + # Generating Docs Locally To generate docs locally, use the following commands: diff --git a/docs/developer/oncall.md b/docs/developer/oncall.md index b88da7bb6df..ee5582ca24d 100644 --- a/docs/developer/oncall.md +++ b/docs/developer/oncall.md @@ -1,3 +1,13 @@ + +--> + # Oncall Overview During your oncall week, you will be assigned to all PRs marked “Ready for diff --git a/docs/developer/submit.md b/docs/developer/submit.md index a096312d21e..a46df22f85c 100644 --- a/docs/developer/submit.md +++ b/docs/developer/submit.md @@ -1,3 +1,12 @@ + + # How to Submit a PR ## Step 1: Add PR label `Expert Review` diff --git a/docs/discussions/README.md b/docs/discussions/README.md index 4ac3c4e3254..b95300649d1 100644 --- a/docs/discussions/README.md +++ b/docs/discussions/README.md @@ -1,3 +1,12 @@ + + # Megatron Discussions This directory contains in-depth guides, tutorials, and discussions about optimizing and using Megatron for various use cases. diff --git a/docs/discussions/megatron-fsdp-user-guide/megatron-fsdp-user-guide.md b/docs/discussions/megatron-fsdp-user-guide/megatron-fsdp-user-guide.md index c2354ad07f0..5ef71043f86 100644 --- a/docs/discussions/megatron-fsdp-user-guide/megatron-fsdp-user-guide.md +++ b/docs/discussions/megatron-fsdp-user-guide/megatron-fsdp-user-guide.md @@ -1,3 +1,12 @@ + + # Megatron-FSDP User Guide ## Table of Contents diff --git a/docs/documentation.md b/docs/documentation.md index 16fbd7b9a7e..652a142aec9 100644 --- a/docs/documentation.md +++ b/docs/documentation.md @@ -1,3 +1,12 @@ + + --- orphan: true --- diff --git a/docs/get-started/install.md b/docs/get-started/install.md index dd000500f58..e1d7202b3fc 100644 --- a/docs/get-started/install.md +++ b/docs/get-started/install.md @@ -1,3 +1,12 @@ + + # Megatron Core Installation Installation is supported using Docker and pip. diff --git a/docs/get-started/overview.md b/docs/get-started/overview.md index 883f40e0c61..42b964d5cec 100644 --- a/docs/get-started/overview.md +++ b/docs/get-started/overview.md @@ -1,3 +1,12 @@ + + # Overview Megatron-Core and Megatron-LM are open-source tools that are typically used together to train LLMs at scale across GPUs. Megatron-Core expands the capability of Megatron-LM. Megatron Bridge connects Megatron-Core and Megatron-LM to other popular training models, such as Hugging Face. diff --git a/docs/get-started/quickstart.md b/docs/get-started/quickstart.md index 61868e7877c..2addcb519a2 100644 --- a/docs/get-started/quickstart.md +++ b/docs/get-started/quickstart.md @@ -1,3 +1,12 @@ + + # Quick Start ## Quick Installation diff --git a/docs/get-started/releasenotes.md b/docs/get-started/releasenotes.md index e2d77cf0070..e624de19f15 100644 --- a/docs/get-started/releasenotes.md +++ b/docs/get-started/releasenotes.md @@ -1,3 +1,12 @@ + + # Release Notes diff --git a/docs/index.md b/docs/index.md index 448a75e4c93..c68f608a73b 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,3 +1,12 @@ + + # Megatron Core User Guide **Megatron Core** is a GPU-optimized library for training large language models at scale. It provides modular, composable building blocks for creating custom training frameworks with state-of-the-art parallelism strategies and performance optimizations. @@ -6,13 +15,13 @@ Megatron Core offers a flexible, reusable foundation for building large-scale tr ## Key Features -* Composable transformer building blocks (attention, MLP, etc.) +* Composable transformer building blocks (attention, MLP) * Advanced parallelism strategies (TP, PP, DP, EP, CP) * Pipeline schedules and distributed optimizers * Mixed precision support (FP16, BF16, FP8) * GPU-optimized kernels and memory management * High-performance dataloaders and dataset utilities -* Model architectures (LLaMA, Qwen, DeepSeek, GPT, Mamba, etc.) +* Model architectures (LLaMA, Qwen, DeepSeek, GPT, Mamba) ```{toctree} @@ -81,16 +90,16 @@ developer/generate_docs ```{toctree} :maxdepth: 2 :hidden: -:caption: Discussions +:caption: API Reference -advanced/index +api-guide/index +apidocs/index.rst ``` ```{toctree} :maxdepth: 2 :hidden: -:caption: API Reference +:caption: Resources -api-guide/index -apidocs/index.rst +advanced/index ``` \ No newline at end of file diff --git a/docs/llama_mistral.md b/docs/llama_mistral.md index a79bb2c4bf9..076409cd4f5 100644 --- a/docs/llama_mistral.md +++ b/docs/llama_mistral.md @@ -1,3 +1,12 @@ + + # Llama, Mistral and other Llama-like model support in Megatron-LM NOTE: In order to simplify code we now only support converting llama-3.x and mistral checkpoints downloaded from Hugging Face. For converting other models, see [Megatron Bridge](models/index.md). diff --git a/docs/models/index.md b/docs/models/index.md index 6fabd1f582c..0ee379b01bd 100644 --- a/docs/models/index.md +++ b/docs/models/index.md @@ -1,3 +1,12 @@ + + # Supported Models Megatron Core supports a wide range of language and multimodal models with optimized implementations for large-scale training. diff --git a/docs/models/llms.md b/docs/models/llms.md index 6789a4c551c..f649673a2cc 100644 --- a/docs/models/llms.md +++ b/docs/models/llms.md @@ -1,3 +1,12 @@ + + # Language Models Megatron Core supports the following language model architectures for large-scale training. diff --git a/docs/models/multimodal.md b/docs/models/multimodal.md index 66ed8ccd9cb..dce977e261d 100644 --- a/docs/models/multimodal.md +++ b/docs/models/multimodal.md @@ -1,3 +1,12 @@ + + # Multimodal Models Megatron Core supports multimodal models that combine language with vision, audio, and other modalities for comprehensive multimodal understanding. diff --git a/docs/project.json b/docs/project.json index aa547ddd298..d5b9535338b 100644 --- a/docs/project.json +++ b/docs/project.json @@ -1,2 +1,2 @@ -{"name": "megatron-lm", "version": "latest"} +{"name": "megatron-lm", "version": "nightly"} diff --git a/docs/user-guide/data-preparation.md b/docs/user-guide/data-preparation.md index 3ff5eedba89..18da2d80fe1 100644 --- a/docs/user-guide/data-preparation.md +++ b/docs/user-guide/data-preparation.md @@ -1,3 +1,12 @@ + + # Data Preparation Preparing your data correctly is essential for successful training with Megatron Core. diff --git a/docs/user-guide/features/context_parallel.md b/docs/user-guide/features/context_parallel.md index 841c16326b3..c44366187be 100644 --- a/docs/user-guide/features/context_parallel.md +++ b/docs/user-guide/features/context_parallel.md @@ -1,3 +1,12 @@ + + # context_parallel package ## Context parallelism overview diff --git a/docs/user-guide/features/custom_fsdp.md b/docs/user-guide/features/custom_fsdp.md index 2f81eb0c5ef..ab1a1efc402 100644 --- a/docs/user-guide/features/custom_fsdp.md +++ b/docs/user-guide/features/custom_fsdp.md @@ -1,3 +1,12 @@ + + # Megatron FSDP **NOTE: In M-Core 0.14, the custom FSDP refactored its checkpoint implementation to use DTensor-based torch distributed checkpointing. The custom FSDP was also renamed Megatron FSDP. The relevant sections of this document are no longer applicable.** diff --git a/docs/user-guide/features/dist_optimizer.md b/docs/user-guide/features/dist_optimizer.md index ddb6079885c..4e47791c12f 100644 --- a/docs/user-guide/features/dist_optimizer.md +++ b/docs/user-guide/features/dist_optimizer.md @@ -1,3 +1,12 @@ + + # Distributed Optimizer The motivation for the distributed optimizer is to save memory by distributing the optimizer state evenly across data parallel ranks (https://arxiv.org/abs/1910.02054), versus the naive method of replicating the optimizer state across data parallel ranks. diff --git a/docs/user-guide/features/fine_grained_activation_offloading.md b/docs/user-guide/features/fine_grained_activation_offloading.md index 53211d1d06c..494674bd4f0 100644 --- a/docs/user-guide/features/fine_grained_activation_offloading.md +++ b/docs/user-guide/features/fine_grained_activation_offloading.md @@ -1,3 +1,12 @@ + + # Fine-grained Activation Offloading (collaborated with rednote) Memory capacity is more and more important with the rising of extreme sparse MoE models like DeepSeek-V3 and Qwen3-235B. Fine-grained recomputing reduces the memory footprint at the cost of extra recomputation, while offloading could utilize the host-device bandwidth to achieve nearly zero-overhead. Fine-grained Activation Offloading targets at offloading the activation at the granularity of specific modules, so that we can calibrate the amount of offloading activation to maximize the training throughput. diff --git a/docs/user-guide/features/index.md b/docs/user-guide/features/index.md index 7730443e91b..fc5a1aa1abe 100644 --- a/docs/user-guide/features/index.md +++ b/docs/user-guide/features/index.md @@ -1,3 +1,12 @@ + + # Advanced Features Advanced feature guides for key Megatron Core capabilities. diff --git a/docs/user-guide/features/megatron_energon.md b/docs/user-guide/features/megatron_energon.md index d08bde21e38..9ebba72083a 100644 --- a/docs/user-guide/features/megatron_energon.md +++ b/docs/user-guide/features/megatron_energon.md @@ -1,3 +1,12 @@ + + # Megatron Energon Advanced multimodal dataloader for efficient loading of text, images, video, and audio at scale. diff --git a/docs/user-guide/features/megatron_rl.md b/docs/user-guide/features/megatron_rl.md index 128b41bdaf5..653ecb92459 100644 --- a/docs/user-guide/features/megatron_rl.md +++ b/docs/user-guide/features/megatron_rl.md @@ -1,3 +1,12 @@ + + # Megatron RL Reinforcement learning library for post-training large language models at scale. diff --git a/docs/user-guide/features/moe.md b/docs/user-guide/features/moe.md index 56aca8c6999..2ba4ce4452e 100644 --- a/docs/user-guide/features/moe.md +++ b/docs/user-guide/features/moe.md @@ -1,3 +1,12 @@ + + # Mixture of Experts ```{toctree} diff --git a/docs/user-guide/features/multi_latent_attention.md b/docs/user-guide/features/multi_latent_attention.md index 5628b8cfee3..4310843557a 100644 --- a/docs/user-guide/features/multi_latent_attention.md +++ b/docs/user-guide/features/multi_latent_attention.md @@ -1,3 +1,12 @@ + + # Multi-Latent Attention ## Multi-Latent Attention overview diff --git a/docs/user-guide/features/multi_token_prediction.md b/docs/user-guide/features/multi_token_prediction.md index 891bf4c93c5..e16108bbcfa 100644 --- a/docs/user-guide/features/multi_token_prediction.md +++ b/docs/user-guide/features/multi_token_prediction.md @@ -1,3 +1,12 @@ + + # Multi-Token Prediction (MTP) Multi-Token Prediction (MTP) extends the prediction scope to multiple future tokens at each position. On the one hand, an MTP objective densifies the training signals and may improve @@ -7,7 +16,7 @@ data efficiency. On the other hand, MTP may enable the model to pre-plan its rep The k-th MTP module consists of a shared embedding layer, a projection matrix, a Transformer block, and a shared output head. For the i-th input token at the (k - 1)-th prediction depth, we first combine the representation of the i-th token and the embedding of the (i + K)-th token with the linear projection. The combined serves as the input of the Transformer block at the k-th depth to produce the output representation. -For more information, please refer to [DeepSeek-V3 Technical Report](https://github.com/deepseek-ai/DeepSeek-V3/blob/main/DeepSeek_V3.pdf) +For more information, refer to [DeepSeek-V3 Technical Report](https://arxiv.org/pdf/2412.19437.pdf) ## Related Arguments @@ -45,4 +54,4 @@ Use `m` to represent MTP layers in the pipeline layout string. For example: ## Precautions -Please do not use Context Parallel (CP), or arbitrary AttnMaskType, or learned absolute position embedding type with MTP. These use cases are not yet supported. +Do not use Context Parallel (CP), or arbitrary AttnMaskType, or learned absolute position embedding type with MTP. These use cases are not yet supported. diff --git a/docs/user-guide/features/optimizer_cpu_offload.md b/docs/user-guide/features/optimizer_cpu_offload.md index 408d7f6a788..1496bd0a91e 100644 --- a/docs/user-guide/features/optimizer_cpu_offload.md +++ b/docs/user-guide/features/optimizer_cpu_offload.md @@ -1,3 +1,12 @@ + + # Optimizer CPU Offload ```{include} ../../../megatron/core/optimizer/cpu_offloading/README.md diff --git a/docs/user-guide/features/pipeline_parallel_layout.md b/docs/user-guide/features/pipeline_parallel_layout.md index 30c8ce1a500..96b00eca004 100644 --- a/docs/user-guide/features/pipeline_parallel_layout.md +++ b/docs/user-guide/features/pipeline_parallel_layout.md @@ -1,3 +1,12 @@ + + # Custom Pipeline Model Parallel Layout *This is an experimental feature and may be changed.* diff --git a/docs/user-guide/features/tokenizers.md b/docs/user-guide/features/tokenizers.md index 0aecf8df8a7..bc1a47cec76 100644 --- a/docs/user-guide/features/tokenizers.md +++ b/docs/user-guide/features/tokenizers.md @@ -1,3 +1,12 @@ + + # Tokenizers Megatron Core provides a unified tokenizer system with a HuggingFace-style API for easy tokenizer management and configuration. @@ -141,7 +150,7 @@ Use a null tokenizer for testing or non-text models: ```python tokenizer = MegatronTokenizer.from_pretrained( - metadata_path={"library": "null"}, + metadata_path={"library": "null-text"}, vocab_size=131072, ) ``` @@ -173,16 +182,6 @@ torchrun --nproc_per_node=8 pretrain_gpt.py \ If `--tokenizer-metadata` is not specified, a default metadata file is generated automatically based on the tokenizer type. -### Legacy Tokenizer Support - -The old tokenizer system is still supported for backward compatibility: - -```bash -torchrun --nproc_per_node=8 pretrain_gpt.py \ - --legacy-tokenizer \ - ... -``` - ## Supported Tokenizer Libraries | Library | Description | Use Case | diff --git a/docs/user-guide/index.md b/docs/user-guide/index.md index bbe85451582..45e70c3a520 100644 --- a/docs/user-guide/index.md +++ b/docs/user-guide/index.md @@ -1,3 +1,12 @@ + + # User Guide Comprehensive guides for using Megatron Core and Megatron-LM. diff --git a/docs/user-guide/msc_integration.md b/docs/user-guide/msc_integration.md index fd73ac7e8f4..a197f25afc1 100644 --- a/docs/user-guide/msc_integration.md +++ b/docs/user-guide/msc_integration.md @@ -1,3 +1,12 @@ + + ```{include} ../../megatron/core/MSC_Integration.md ``` diff --git a/docs/user-guide/parallelism-guide.md b/docs/user-guide/parallelism-guide.md index 2baf518ae85..8d5cb8ff7c3 100644 --- a/docs/user-guide/parallelism-guide.md +++ b/docs/user-guide/parallelism-guide.md @@ -1,3 +1,12 @@ + + # Parallelism Strategies Guide Megatron Core supports multiple parallelism strategies that can be combined to efficiently train models from billions to trillions of parameters across thousands of GPUs. diff --git a/docs/user-guide/quickstart.md b/docs/user-guide/quickstart.md deleted file mode 100644 index 7baed06d6be..00000000000 --- a/docs/user-guide/quickstart.md +++ /dev/null @@ -1,3 +0,0 @@ -```{include} ../../megatron/core/QuickStart.md -``` - diff --git a/docs/user-guide/training-examples.md b/docs/user-guide/training-examples.md index 2824c608c36..425728c9e74 100644 --- a/docs/user-guide/training-examples.md +++ b/docs/user-guide/training-examples.md @@ -1,3 +1,12 @@ + + # Training Examples Get started with Megatron Core training using these practical examples. diff --git a/docs/versions1.json b/docs/versions1.json index a524c5921a8..ae1809d538e 100644 --- a/docs/versions1.json +++ b/docs/versions1.json @@ -1,7 +1,12 @@ [ { - "name": "latest", - "version": "latest", + "name": "nightly", + "version": "nightly", + "url": "https://docs.nvidia.com/megatron-core/nightly/" + }, + { + "name": "0.16.0 (latest)", + "version": "0.16.0", "url": "https://docs.nvidia.com/megatron-core/latest/" }, { diff --git a/examples/gpt3/gpt_config.yaml b/examples/gpt3/gpt_config.yaml index 18d305d9cb1..600f50221ce 100644 --- a/examples/gpt3/gpt_config.yaml +++ b/examples/gpt3/gpt_config.yaml @@ -92,7 +92,6 @@ model_parallel: # Optimizations gradient_accumulation_fusion: True - async_tensor_model_parallel_allreduce: True tp_comm_overlap: False # Debug Options diff --git a/examples/inference/gpt/gpt_dynamic_inference.py b/examples/inference/gpt/gpt_dynamic_inference.py index 88b744b3ac0..2582f57e13e 100644 --- a/examples/inference/gpt/gpt_dynamic_inference.py +++ b/examples/inference/gpt/gpt_dynamic_inference.py @@ -1,40 +1,31 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# pylint: disable=bad-builtin + import hashlib import io import json -import math import os -import pickle import sys import warnings -import torch -from argparse import ArgumentParser from collections import defaultdict -from functools import partial +from typing import Dict, List, Optional + +import torch from tqdm import tqdm -from typing import Dict, List, Tuple, Optional sys.path.append( os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)) ) -import megatron from examples.inference.gpt.utils import ( Request, - add_common_inference_args, build_dynamic_engine_setup_prefix, build_requests, get_curr_time, get_global_peak_memory_stats_bytes, ) -from megatron.core.inference.contexts.dynamic_context import ( - ContextOverflowError, - DynamicInferenceContext, -) -from megatron.core.inference.contexts.attention_context.mamba_metadata import ( - MambaInferenceStateConfig, -) +from megatron.core.inference.contexts.dynamic_context import DynamicInferenceContext from megatron.core.inference.engines import DynamicInferenceEngine, EngineSuspendedError from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( GPTInferenceWrapper, @@ -43,195 +34,27 @@ from megatron.core.inference.text_generation_controllers.text_generation_controller import ( TextGenerationController, ) -from megatron.core.tokenizers.text.utils.build_tokenizer import build_tokenizer -from megatron.core.transformer.module import MegatronModule -from megatron.core.utils import get_mamba_inference_state_config_from_model +from megatron.core.tokenizers.utils.build_tokenizer import build_tokenizer +from megatron.inference.utils import ( + add_inference_args, + get_inference_config_from_model_and_args, + get_model_for_inference, +) sys.path.append( os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)) ) -from megatron.training import get_args, get_model as _get_model, get_tokenizer, initialize_megatron -from megatron.training.checkpointing import load_checkpoint -from model_provider import model_provider -from gpt_builders import gpt_builder -from mamba_builders import mamba_builder +import logging +import megatron from megatron.core.utils import configure_nvtx_profiling -import logging +from megatron.training import get_args, get_tokenizer, initialize_megatron torch.serialization.add_safe_globals([io.BytesIO]) torch.serialization.add_safe_globals([megatron.core.rerun_state_machine.RerunState]) torch.serialization.add_safe_globals([megatron.core.rerun_state_machine.RerunDiagnostic]) -def add_dynamic_inference_args(parser: ArgumentParser) -> ArgumentParser: - """Dynamic inference arguments.""" - - add_common_inference_args(parser) - - group = parser.add_argument_group(title='Dynamic inference') - group.add_argument( - "--inference-ckpt-non-strict", - action="store_true", - help="Load checkpoint with `strict=False`.", - ) - group.add_argument( - "--termination-id", type=int, default=None, - help="Termination ID that overrides `tokenizer.eod`.", - ) - group.add_argument( - "--suspend-resume-interval", type=int, default=None, - help="Suspend and resume the dynamic engine every " - "`suspend_resume_interval` steps. This is used to tet the suspend/resume " - "system.", - ) - group.add_argument( - "--inference-repeat-n", type=int, default=1, - help="Repeat inference iterations N times for benchmarking." - ) - group.add_argument( - "--throughput-check-only", - action='store_true', - default=False, - help="If true, only run throughput check without verifying outputs." - ) - - return parser - - -def get_model() -> MegatronModule: - """Initialize model and load checkpoint.""" - - args = get_args() - - if args.model_provider == "gpt": - model_builder = gpt_builder - elif args.model_provider == "mamba": - model_builder = mamba_builder - else: - raise ValueError(f"Invalid model provider {args.model_provider}") - - # Build model. - model = _get_model( - partial(model_provider, model_builder), - wrap_with_ddp=False - ) - - # Load checkpoint. - assert args.load is not None - args.exit_on_missing_checkpoint = True - load_checkpoint( - ddp_model=model, - optimizer=None, - opt_param_scheduler=None, - strict=not args.inference_ckpt_non_strict, - ) - - # No virtual PP. - assert len(model) == 1, "Above condition should have caught this" - model = model[0] - - # Eval mode. - model.eval() - - return model - - -def get_inference_context( - requests: List[Request], - sampling_params: Optional[SamplingParams] = None, - calculate_max_sequence_length_from_requests: bool = True, - mamba_inference_state_config: Optional[MambaInferenceStateConfig] = None, -): - """The inference context manages the KV cache and other inference state.""" - - args = get_args() - - # Max sequence length. - if calculate_max_sequence_length_from_requests: - max_gen_length = sampling_params.num_tokens_to_generate - max_context_length = max(len(r.prompt_tokens) for r in requests) - max_sequence_length = max_context_length + max_gen_length - else: - max_sequence_length = args.inference_max_seq_length - - metrics_writer = None - if args.inference_logging_step_interval > 0 and args.inference_wandb_logging: - metrics_writer = get_wandb_writer() - - # Inference context. - context = DynamicInferenceContext( - params_dtype=args.params_dtype, - num_layers=args.num_layers // args.pipeline_model_parallel_size, - kv_channels=args.kv_channels, - num_attention_heads=( - args.num_query_groups if args.group_query_attention else args.num_attention_heads - ), - max_sequence_length=max_sequence_length, - num_cuda_graphs=( - args.inference_dynamic_batching_num_cuda_graphs - if args.cuda_graph_impl == "local" - else None - ), - block_size_tokens=args.inference_dynamic_batching_block_size, - buffer_size_gb=args.inference_dynamic_batching_buffer_size_gb, - paused_buffer_size_gb=args.inference_dynamic_batching_paused_buffer_size_gb, - max_requests=args.inference_dynamic_batching_max_requests, - max_tokens=args.inference_dynamic_batching_max_tokens, - tensor_model_parallel_size=args.tensor_model_parallel_size, - pipeline_model_parallel_size=args.pipeline_model_parallel_size, - materialize_only_last_token_logits=not args.return_log_probs, - mamba_inference_state_config=mamba_inference_state_config, - cache_mla_latent=args.multi_latent_attention and args.cache_mla_latents, - kv_lora_rank=args.kv_lora_rank if args.multi_latent_attention else None, - qk_pos_emb_head_dim=args.qk_pos_emb_head_dim, - use_cuda_graphs_for_non_decode_steps=not args.decode_only_cuda_graphs, - use_flashinfer_fused_rope=args.use_flashinfer_fused_rope, - unified_memory_level=args.inference_dynamic_batching_unified_memory_level, - cuda_graph_max_tokens=args.inference_dynamic_batching_cuda_graph_max_tokens, - cuda_graph_mixed_prefill_count=args.inference_dynamic_batching_cuda_graph_mixed_prefill_count, - metrics_writer=metrics_writer, - offload_kv_cache=args.rl_offload_kv_cache_during_training - ) - - return context - - -def get_inference_controller( - model: MegatronModule, context: DynamicInferenceContext -) -> TextGenerationController: - """Buid text generation controller, which manages the model inference context. - - Args: - model (MegatronModule): Megatron GPT model. - context (DynamicInferenceContext): Context for managing KV cache blocks. - - Return: - (TextGenerationController) Inference text generation controller. - """ - - args = get_args() - if args.legacy_tokenizer: - tokenizer = get_tokenizer() - else: - tokenizer = build_tokenizer(args) - - # Wrap model in inference wrapper. - model = GPTInferenceWrapper(model, args, context) - - # Note: the following is taken from AbstractModelInferenceWrapper.prep_model_for_inference(). - from megatron.core import parallel_state - - model.model_is_pipeline_parallel = not ( - parallel_state.is_pipeline_first_stage() and parallel_state.is_pipeline_last_stage() - ) - - # Text generation controller. - controller = TextGenerationController(model, tokenizer) - - return controller - - def run_inference( requests: List[Request], engine: DynamicInferenceEngine, @@ -284,11 +107,7 @@ def _add_request(): """ nonlocal num_requests_added _request = requests[num_requests_added] - engine.add_request( - num_requests_added, - _request.prompt_text, - _request.sampling_params, - ) + engine.add_request(num_requests_added, _request.prompt_text, _request.sampling_params) _request.time_start = get_curr_time() _request.state = "started" num_requests_added += 1 @@ -305,10 +124,9 @@ def _add_request(): _add_request() else: # Add deterministic number of requests (generally used for debugging). - for i in range(min( - args.incoming_requests_per_step, - num_requests_total - num_requests_added, - )): + for i in range( + min(args.incoming_requests_per_step, num_requests_total - num_requests_added) + ): _add_request() add_times.append(get_curr_time() - add_start) @@ -318,11 +136,12 @@ def _add_request(): result = engine.step_modern() except EngineSuspendedError as e: result = e - pass # ignore error in order to call 'engine.resume()' below. + pass # ignore error in order to call 'engine.resume()' below. attempted_step_count += 1 - # After step, we lost track of last iteration's is_decode_only, so we need to get it from the engine - is_decode_only = engine.is_decode_only + # After step, we lost track of last iteration's is_decode_only, + # so we need to get it from the engine + is_decode_only = engine.is_decode_only # Test suspending and resuming engine. if args.suspend_resume_interval is not None: @@ -335,9 +154,9 @@ def _add_request(): # Resume, 0+ attempted steps later. if ( attempted_step_count > 0 - and - (attempted_step_count - args.suspend_resume_interval // 2) - % args.suspend_resume_interval == 0 + and (attempted_step_count - args.suspend_resume_interval // 2) + % args.suspend_resume_interval + == 0 ): print("**** step %d/%d ... resume." % (engine.step_count, attempted_step_count)) engine.resume() @@ -349,7 +168,9 @@ def _add_request(): # Record cuda_graph_request_count. cuda_graph_request_count = result["cuda_graph_request_count"] if args.cuda_graph_impl == "local" and cuda_graph_request_count is not None: - cuda_graph_request_count_map[cuda_graph_request_count] = cuda_graph_request_count_map.get(cuda_graph_request_count, 0) + 1 + cuda_graph_request_count_map[cuda_graph_request_count] = ( + cuda_graph_request_count_map.get(cuda_graph_request_count, 0) + 1 + ) # Update requests. active_request_ids = result["active_request_ids"] @@ -374,6 +195,8 @@ def _add_request(): request.request_id = finished_request.request_id request.events = finished_request.events + request.ttft = finished_request.ttft + # Update prompt, in case engine has been suspended and resumed. request.prompt_tokens = finished_request.prompt_tokens.tolist() request.prompt_text = finished_request.prompt @@ -408,38 +231,37 @@ def _add_request(): engine.resume() return { - "step_times" : step_times, - "add_times" : add_times, - "output_times" : output_times, - "total_output_tokens" : total_output_tokens, - "cuda_graph_request_count_map" : cuda_graph_request_count_map, + "step_times": step_times, + "add_times": add_times, + "output_times": output_times, + "total_output_tokens": total_output_tokens, + "cuda_graph_request_count_map": cuda_graph_request_count_map, } @torch.inference_mode() def main(): - + """Run dynamic inference.""" # Initialize Megatron. initialize_megatron( - extra_args_provider=add_dynamic_inference_args, + extra_args_provider=add_inference_args, args_defaults={'no_load_rng': True, 'no_load_optim': True}, ) # Start Nsight profiler. if os.environ.get("NSIGHT_PREFIX"): torch.cuda.cudart().cudaProfilerStart() - - level_str = os.getenv("LOG_LEVEL", "INFO").upper() - level = getattr(logging, level_str, logging.INFO) + + level_str = os.getenv("LOG_LEVEL", "INFO").upper() + level = getattr(logging, level_str, logging.INFO) logging.basicConfig(level=level, force=True) configure_nvtx_profiling(True) args = get_args() - if args.legacy_tokenizer: - tokenizer = get_tokenizer() - else: - tokenizer = build_tokenizer(args) + + # Build tokenizer + tokenizer = build_tokenizer(args) # Reset peak memory stats so functional tests measure this run and not # whatever happened earlier during initialization. @@ -456,42 +278,36 @@ def main(): termination_id=args.termination_id if args.termination_id is not None else tokenizer.eod, top_n_logprobs=args.top_n_logprobs, stop_words=args.stop_words, - ) - - model = get_model() + ) - mamba_inference_state_config = get_mamba_inference_state_config_from_model(model) + model = get_model_for_inference() # Requests, context, controller. requests = build_requests(args, tokenizer, sampling_params) - context = get_inference_context( - requests, - sampling_params, - mamba_inference_state_config=mamba_inference_state_config, - ) - controller = get_inference_controller(model, context) + inference_config = get_inference_config_from_model_and_args(model, args) + + # Calculate max_sequence_length from requests + max_gen_length = sampling_params.num_tokens_to_generate + max_context_length = max(len(r.prompt_tokens) for r in requests) + inference_config.max_sequence_length = max_context_length + max_gen_length + context = DynamicInferenceContext(model.config, inference_config) + wrapped_model = GPTInferenceWrapper(model, context) + controller = TextGenerationController(wrapped_model, tokenizer) # Validate all context_length's <= max_tokens. - if args.disable_chunked_prefill: + if not args.enable_chunked_prefill: invalid_prompt_length_map = {} for request_idx, request in enumerate(requests): if len(request.prompt_tokens) > context.max_tokens: invalid_prompt_length_map[request_idx] = len(request.prompt_tokens) - assert not invalid_prompt_length_map, ( - "request idxs with prompts longer than context.max_tokens: " - ", ".join(f"{k}({v})" for k, v in invalid_prompt_length_map.items()) + assert ( + not invalid_prompt_length_map + ), "request idxs with prompts longer than context.max_tokens: " ", ".join( + f"{k}({v})" for k, v in invalid_prompt_length_map.items() ) # Inference engine. - engine = DynamicInferenceEngine( - controller, - context, - enable_cuda_graph=args.cuda_graph_impl == "local", - random_seed=args.seed, - track_paused_request_events=args.inference_dynamic_batching_track_paused_request_events, - enable_chunked_prefill=not args.disable_chunked_prefill, - inference_logging_step_interval=args.inference_logging_step_interval, - ) + engine = DynamicInferenceEngine(controller, context) setup_prefix = build_dynamic_engine_setup_prefix(args, model, context, requests) print("~~~") @@ -522,14 +338,13 @@ def main(): # Validate all requests finished. for request in requests: - assert request.state == "finished", ( - f"request.state == '{request.state}' != 'finished'." - ) + assert request.state == "finished", f"request.state == '{request.state}' != 'finished'." peak_mem_stats = get_global_peak_memory_stats_bytes() # Print unique prompts + outputs. if torch.distributed.get_rank() == 0: + def escape_str(s): return s.replace("\n", "\\n") @@ -547,7 +362,10 @@ def escape_str(s): # ---- Prompt summary line ---- prompt_len = len(requests[request_idxs[0]].prompt_tokens) escaped_prompt_text = escape_str(prompt_text) - print(f"\n{unique_idx+1}/{len(unique_prompt_map)} [n {len(request_idxs)}, l {prompt_len}] {escaped_prompt_text}") + print( + f"\n{unique_idx+1}/{len(unique_prompt_map)}" + f"[n {len(request_idxs)}, l {prompt_len}] {escaped_prompt_text}" + ) # ---- Group all outputs for this prompt ---- output_map = defaultdict(list) @@ -567,16 +385,17 @@ def escape_str(s): # Use hash of prompt + generated text in case engine was # suspended and resumed, which misaligns boundary between # prompt and generated tokens. - o_hash = hashlib.sha256( - (prompt_text + output_text).encode() - ).hexdigest()[:6] + o_hash = hashlib.sha256((prompt_text + output_text).encode()).hexdigest()[:6] o_len = len(requests[output_request_idxs[0]].output_tokens) escaped_output_text = escape_str(output_text) else: o_hash = "--" o_len = 0 escaped_output_text = "--" - print(f" >>>> [n {len(output_request_idxs)}, {o_len} tokens, hash {o_hash}{', ' if evicted else ''}] {escaped_output_text}") + print( + f" >>>> [n {len(output_request_idxs)}, {o_len} tokens, hash {o_hash}" + f"{', ' if evicted else ''}] {escaped_output_text}" + ) text_hashes.append(o_hash) # Write results to JSON. Primarily used for functional testing. @@ -592,14 +411,17 @@ def escape_str(s): "generated_text": req.output_text, "generated_tokens": req.output_tokens, "latency": req.time_end - req.time_start, - "cuda_graph_request_count_map" : result["cuda_graph_request_count_map"], - "step_count" : engine.step_count, - "top_n_logprobs" : getattr(req, 'generated_top_n_logprobs', None), - "prompt_top_n_logprobs" : getattr(req, 'prompt_top_n_logprobs', None), + "ttft": req.ttft, # Time-to-first-token in seconds + "cuda_graph_request_count_map": result["cuda_graph_request_count_map"], + "step_count": engine.step_count, + "top_n_logprobs": getattr(req, 'generated_top_n_logprobs', None), + "prompt_top_n_logprobs": getattr(req, 'prompt_top_n_logprobs', None), } if req.sampling_params.return_log_probs: result_dict["prompt_logprobs"] = getattr(req, 'prompt_log_probs', None) - result_dict["generated_logprobs"] = getattr(req, 'generated_log_probs', None) + result_dict["generated_logprobs"] = getattr( + req, 'generated_log_probs', None + ) result_dict["logprobs"] = getattr(req, 'logprobs', None) json_results[req.request_id] = result_dict @@ -631,7 +453,7 @@ def escape_str(s): d_count = len(d_times) p_mean = p_total / p_count - d_mean = d_total / d_count if d_count != 0 else 0. + d_mean = d_total / d_count if d_count != 0 else 0.0 # Commented out for now as the step/add/output times are not calculated correctly. # print( @@ -643,18 +465,13 @@ def escape_str(s): # f"mean [ p {p_mean:.3f}s, d {d_mean:.3f}s ], " # f"count [ p {p_count}, d {d_count} ]." # ) - capture_str = ( - f"{engine.capture_stats['time']:.2f} sec" - if engine.capture_stats else - "--" - ) + capture_str = f"{engine.capture_stats['time']:.2f} sec" if engine.capture_stats else "--" print( - f"{setup_prefix} … " - f"throughput: {throughput:.3f} tok/s … ", + f"{setup_prefix} … " f"throughput: {throughput:.3f} tok/s … ", f"total time: {total_time:.3f}s … " f"mem {peak_alloc_gb:.1f}/{peak_resvd_gb:.1f} GB … " f"steps: {engine.step_count:d} … " - f"capture {capture_str}" + f"capture {capture_str}", ) print("~~~") diff --git a/examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py b/examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py index cbb7a1aa745..536f533eccd 100644 --- a/examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py +++ b/examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py @@ -2,43 +2,33 @@ import asyncio import json +import logging import os import time -import torch -import torch.distributed as dist +import warnings from collections import defaultdict -from tqdm import tqdm from typing import List -import warnings -import logging -from examples.inference.gpt.gpt_dynamic_inference import ( - add_dynamic_inference_args, - get_inference_context, - get_inference_controller, - get_model, -) -from examples.inference.gpt.utils import ( - Request, - build_dynamic_engine_setup_prefix, - build_requests, - add_common_inference_args -) +import torch +import torch.distributed as dist -from megatron.core import parallel_state +from examples.inference.gpt.utils import Request, build_dynamic_engine_setup_prefix, build_requests from megatron.core.inference.engines import DynamicInferenceEngine from megatron.core.inference.inference_client import InferenceClient from megatron.core.inference.inference_request import DynamicInferenceRequestRecord from megatron.core.inference.sampling_params import SamplingParams -from megatron.core.utils import get_mamba_inference_state_config_from_model - +from megatron.inference.utils import ( + add_inference_args, + get_dynamic_inference_engine, + get_model_for_inference, +) from megatron.training import get_args, get_tokenizer, initialize_megatron -from megatron.training.arguments import parse_args # pylint: disable=line-too-long logging.basicConfig(level=logging.INFO, force=True) + async def main( engine: DynamicInferenceEngine, requests: List[Request], @@ -51,12 +41,11 @@ async def main( "Sampling parameters are specified per request.", DeprecationWarning, ) - + # once you call engine.start_listening_to_data_parallel_coordinator, # the engine will start accepting requests from the data parallel coordinator. # and processing them in an asyncio coroutine. # leaving inference_coordinator_port as None will find a free port automatically. - dp_addr = await engine.start_listening_to_data_parallel_coordinator( inference_coordinator_port=port, launch_inference_coordinator=True, @@ -69,14 +58,11 @@ async def main( # Since the client doesn't directly call engine.async_step here, we test # the suspend-resume system ~4 times. suspend_resume_interval = max(1, len(requests) // 4) - suspend_idxs = set(range( - suspend_resume_interval, - len(requests) + 1, - suspend_resume_interval, - )) + suspend_idxs = set( + range(suspend_resume_interval, len(requests) + 1, suspend_resume_interval) + ) resume_idxs = set( - min(len(requests), i + suspend_resume_interval // 2) - for i in suspend_idxs + min(len(requests), i + suspend_resume_interval // 2) for i in suspend_idxs ) else: suspend_idxs = set() @@ -98,7 +84,10 @@ async def main( current_time = time.time_ns() / 10**9 if args.incoming_requests_per_step is None: # Only add requests that have arrived at the current time. - while num_requests_added < num_requests_total and requests[num_requests_added].time_arrival <= current_time: + while ( + num_requests_added < num_requests_total + and requests[num_requests_added].time_arrival <= current_time + ): request = requests[num_requests_added] # These add-request calls will queue up the request on a zmq socket and return # instantaneously. They will return an asyncio future which can be awaited for @@ -114,10 +103,9 @@ async def main( else: # Add deterministic number of requests (generally used for debugging). - for i in range(min( - args.incoming_requests_per_step, - num_requests_total - num_requests_added - )): + for i in range( + min(args.incoming_requests_per_step, num_requests_total - num_requests_added) + ): # Change sampling parameters to force different generation lengths. request = requests[num_requests_added] n = request.sampling_params.num_tokens_to_generate @@ -135,7 +123,7 @@ async def main( break # Relinquish control since there are no more requests to add at the moment. This allows the engine to run. await asyncio.sleep(0) - + # While we wait for the requests to complete, the engine runs in the background. results: List[DynamicInferenceRequestRecord] = await asyncio.gather(*futures) @@ -157,6 +145,9 @@ async def main( result_dict["logprobs"] = req.prompt_log_probs + req.generated_log_probs throughput = len(req.generated_tokens) / req.latency throughputs.append(throughput) + if req.routing_indices is not None: + result_dict["routing_indices"] = req.routing_indices.tolist() + json_results[req.request_id] = result_dict throughput_dict = {"throughput": throughputs} if args.throughput_check_only: @@ -170,16 +161,19 @@ async def main( req = record.merge() unique_prompt_map[req.prompt].append(req) for idx, (prompt_text, reqs) in enumerate(unique_prompt_map.items()): - print(f"%d/%d. prompt '%s' ... [%d] output '%s'." % ( - idx, - len(unique_prompt_map), - prompt_text.replace("\n", "\\n"), - len(reqs), - reqs[0].generated_text.replace("\n", "\\n"), - )) + print( + f"%d/%d. prompt '%s' ... [%d] output '%s'." + % ( + idx, + len(unique_prompt_map), + prompt_text.replace("\n", "\\n"), + len(reqs), + reqs[0].generated_text.replace("\n", "\\n"), + ) + ) # kill the engines and suspend the client - # Right now, we can only call stop when all requests are done. + # Right now, we can only call stop when all requests are done. # Todo: Make this explicit in the Client class.... await client.stop_engines() client.stop() @@ -190,11 +184,11 @@ async def main( if __name__ == "__main__": - # enable inference mode in the very beginning as some fp-8 optimizations + # enable inference mode in the very beginning as some fp8 optimizations # check for it. with torch.inference_mode(): initialize_megatron( - extra_args_provider=add_dynamic_inference_args, + extra_args_provider=add_inference_args, args_defaults={'no_load_rng': True, 'no_load_optim': True}, ) @@ -213,34 +207,16 @@ async def main( ), ) - # Requests, context, conroller. - model = get_model() - mamba_inference_state_config = get_mamba_inference_state_config_from_model(model) + model = get_model_for_inference() + requests = ( build_requests(args, tokenizer, sampling_params) if dist.get_rank() == 0 else None ) - context = get_inference_context( - None, - None, - calculate_max_sequence_length_from_requests=False, - mamba_inference_state_config=mamba_inference_state_config, - ) - - controller = get_inference_controller(model, context) - - # Inference engine. - engine = DynamicInferenceEngine( - controller, - context, - enable_cuda_graph=args.cuda_graph_impl == "local", - random_seed=args.seed, - enable_chunked_prefill=not args.disable_chunked_prefill, - inference_logging_step_interval=args.inference_logging_step_interval, - ) + engine = get_dynamic_inference_engine(model=model) if dist.get_rank() == 0: - setup_prefix = build_dynamic_engine_setup_prefix(args, model, context, requests) + setup_prefix = build_dynamic_engine_setup_prefix(args, model, engine.context, requests) print("~~~") print(setup_prefix) print("~~~") @@ -249,13 +225,7 @@ async def main( if os.environ.get("NSIGHT_PREFIX"): torch.cuda.cudart().cudaProfilerStart() - asyncio.run( - main( - engine, - requests, - args.inference_coordinator_port, - ) - ) + asyncio.run(main(engine, requests, args.inference_coordinator_port)) # Stop Nsight profiler. if os.environ.get("NSIGHT_PREFIX"): diff --git a/examples/inference/gpt/gpt_static_inference.py b/examples/inference/gpt/gpt_static_inference.py index 03a60927ab2..17cf7c53b05 100644 --- a/examples/inference/gpt/gpt_static_inference.py +++ b/examples/inference/gpt/gpt_static_inference.py @@ -1,21 +1,11 @@ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. import os -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) -from model_provider import model_provider -from gpt_builders import gpt_builder -from mamba_builders import mamba_builder -import torch import sys import time -import warnings -from functools import partial from argparse import Namespace import torch -import tqdm from megatron.core.inference.contexts import StaticInferenceContext from megatron.core.inference.engines import StaticInferenceEngine @@ -23,17 +13,12 @@ from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( GPTInferenceWrapper, ) -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.text_generation_controllers.text_generation_controller import ( TextGenerationController, ) -from megatron.core.tokenizers.text.utils.build_tokenizer import build_tokenizer +from megatron.core.tokenizers.utils.build_tokenizer import build_tokenizer from megatron.core.transformer.module import MegatronModule -from pretrain_gpt import model_provider as gpt_model_provider -from pretrain_mamba import model_provider as mamba_model_provider sys.path.append( os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)) @@ -41,18 +26,18 @@ import asyncio import json -from typing import Any, AsyncIterator, List +from typing import List -from examples.inference.gpt.utils import add_common_inference_args, build_requests -from megatron.core import mpu -from megatron.training import get_args, get_model, get_tokenizer, print_rank_0 -from megatron.training.checkpointing import load_checkpoint +from examples.inference.gpt.utils import build_requests +from megatron.inference.utils import add_inference_args, get_model_for_inference +from megatron.training import get_args, get_tokenizer, print_rank_0 from megatron.training.initialize import initialize_megatron + def add_static_inference_args(parser): """Static inference arguments.""" - add_common_inference_args(parser) + add_inference_args(parser) group = parser.add_argument_group(title='Static inference') group.add_argument( @@ -79,34 +64,17 @@ def get_inference_engine(args: Namespace, model: MegatronModule) -> StaticInfere Returns: AbstractBackend: The chosen backend """ - if args.legacy_tokenizer: - tokenizer = get_tokenizer() - else: - tokenizer = build_tokenizer(args) - inference_wrapper_config = InferenceWrapperConfig( - hidden_size=args.hidden_size, - inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold, - fp32_residual_connection=args.fp32_residual_connection, - params_dtype=args.params_dtype, - padded_vocab_size=args.padded_vocab_size, - inference_max_requests=args.inference_max_batch_size, - inference_max_seq_length=args.inference_max_seq_length, - nccl_all_reduce_for_prefill=args.nccl_all_reduce_for_prefill, - fp8=args.fp8, - moe_pad_experts_for_cuda_graph_inference = args.moe_pad_experts_for_cuda_graph_inference - ) - - inference_context = StaticInferenceContext.from_config(inference_wrapper_config) - - inference_wrapped_model = GPTInferenceWrapper( - model, inference_wrapper_config, inference_context + tokenizer = build_tokenizer(args) + inference_context = StaticInferenceContext( + args.inference_max_requests, args.inference_max_seq_length ) + inference_wrapped_model = GPTInferenceWrapper(model, inference_context) text_generation_controller = TextGenerationController( inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer ) engine_kwargs = { - "text_generation_controller" : text_generation_controller, - "legacy" : args.use_legacy_static_engine, + "text_generation_controller": text_generation_controller, + "legacy": args.use_legacy_static_engine, } if not args.use_legacy_static_engine: engine_kwargs["buffer_size_gb"] = args.inference_dynamic_batching_buffer_size_gb @@ -165,22 +133,7 @@ def main(): args = get_args() - if args.max_batch_size is not None: - warnings.warn( - f"`--max-batch-size` has been deprecated in favor of `--inference-max-requests`." - ) - args.inference_max_batch_size = max(args.max_batch_size, args.inference_max_batch_size) - - # Set up model and load checkpoint - if args.model_provider == "gpt": - model_builder = gpt_builder - elif args.model_provider == "mamba": - model_builder = mamba_builder - else: - raise ValueError(f"Invalid model provider {args.model_provider}") - model = get_model(partial(model_provider, model_builder), wrap_with_ddp=False) - load_checkpoint(model, None, None, strict=False) - model = model[0] + model = get_model_for_inference() inference_engine = get_inference_engine(args, model) @@ -193,10 +146,9 @@ def main(): top_n_logprobs=args.top_n_logprobs, ) - if args.legacy_tokenizer: - tokenizer = get_tokenizer() - else: - tokenizer = build_tokenizer(args) + # Build tokenizer + tokenizer = build_tokenizer(args) + requests = build_requests(args, tokenizer) prompts = [r.prompt_text for r in requests] @@ -276,7 +228,7 @@ def main(): ) ), len(requests), - args.inference_max_batch_size, + args.inference_max_requests, stats["allocated_bytes.all.peak"] / (1024**3), stats["reserved_bytes.all.peak"] / (1024**3), latency, @@ -293,6 +245,5 @@ def main(): torch.distributed.destroy_process_group() - if __name__ == "__main__": main() diff --git a/examples/inference/gpt/utils.py b/examples/inference/gpt/utils.py index a04b856c0a6..eceb5fa16fd 100644 --- a/examples/inference/gpt/utils.py +++ b/examples/inference/gpt/utils.py @@ -1,158 +1,23 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. import copy -import json import itertools +import json import random import time -import torch from argparse import ArgumentParser, Namespace -from tqdm import tqdm +from functools import partial from typing import Any, List, Optional -from megatron.core.inference.inference_request import DynamicInferenceRequest +import torch +from tqdm import tqdm + from megatron.core.inference.contexts import DynamicInferenceContext from megatron.core.inference.contexts.dynamic_context import get_mem_size_str -from megatron.core.transformer.module import MegatronModule - +from megatron.core.inference.inference_request import DynamicInferenceRequest from megatron.core.inference.sampling_params import SamplingParams - - -def add_common_inference_args(parser: ArgumentParser) -> ArgumentParser: - """Common inference arguments.""" - - group = parser.add_argument_group(title='Common inference') - - group.add_argument("--temperature", type=float, default=1.0, help='Sampling temperature.') - group.add_argument("--top_k", type=int, default=1, help='Top k sampling.') - group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.') - group.add_argument( - "--return-log-probs", - action='store_true', - default=False, - help='Return the log probabilities of the final output tokens', - ) - group.add_argument( - "--prompts", - metavar='N', - type=str, - nargs='+', - help='Input prompts with each prompt within quotes and seperated by space', - ) - group.add_argument( - "--num-tokens-to-prompt", - type=int, - nargs="+", - default=[64, 1024], - help='Number of tokens to use for simulated prompts. This should be a ' - 'space-separated pair of integers, and the generated prompt lengths will ' - 'be uniformly sampled within this range.', - ) - group.add_argument( - "--num-tokens-to-generate", - type=int, - default=30, - help='Number of tokens to generate for each prompt', - ) - group.add_argument( - "--num-tokens-from-file", - action='store_true', - default=False, - help='Use per-prompt num_tokens_to_generate from prompt file', - ) - group.add_argument( - "--top-n-logprobs", - type=int, - default=0, - help='Return the top n logprobs for the generated tokens and their corresponding token as a dictionary', - ) - group.add_argument( - "--incoming-requests-per-step", - type=int, default=None, - help="Add a deterministic number of requests per step. This arg is " - "prioritized over `--incoming-requests-per-sec` below (which is non-" - "deterministic). Note that the number of requests added per step is " - "additionally limited by the inference context's `max_requests`, " - "`max_tokens`, and KV buffer size.", - ) - group.add_argument( - "--incoming-requests-per-sec", - type=float, - default=100.0, - help="Simulated number of requests per second. Set to -1 to add all requests together.", - ) - group.add_argument( - "--incoming-requests-duration", - type=float, - default=10.0, - help="Total amount of time to simulate that requests are " - "arriving. Multiply this value with " - "`--incoming-requests-per-sec` to get the approximate " - "total number of requests. Set to -1 to add all requests together.", - ) - group.add_argument( - "--model-provider", - choices=["mamba", "gpt"], - default="gpt", - help="Model provider", - ) - group.add_argument( - "--skip-prompt-log-probs", - action='store_true', - default=False, - help='Skip prompt log probs.', - ) - group.add_argument( - "--stop-words", - metavar='WORD', - type=str, - nargs='+', - default=None, - help='Stop words to terminate generation. Each word should be quoted and ' - 'separated by space. Example: --stop-words "\\n\\n" "END" "###"', - ) - group.add_argument( - "--output-path", - type=str, - default=None, - help="Path to save generations as JSON", - ) - group.add_argument( - "--output-every-n-results", - type=int, - default=1, - help="To minimize the output file size of larger runs, only write the " - "results of every `n` requests.", - ) - group.add_argument( - "--prompt-file", - help='Jsonl file containing input prompts, where each item (i.e., line) ' - 'contains the field \'text\' where the value is the prompt. All other ' - 'fields within each item are ignored, and may be customized for each ' - 'application.', - ) - group.add_argument( - "--prompt-file-num-truncate", - type=int, - help='Number of samples to use from the loaded prompt file (see ' - '`--prompt-file` above). The first `--prompt-file-num-truncate` samples ' - 'will be used, in order.', - ) - group.add_argument( - "--use-flashinfer-fused-rope", - action='store_true', - default=False, - help='Use flashinfer fused rope implementation.', - ) - group.add_argument( - "--no-record-throughput", - action='store_false', - dest="record_throughput", - help="Disable throughput recording in --output-file" - - ) - - return parser +from megatron.core.transformer.module import MegatronModule +from megatron.training import get_args def get_default_sampling_params(termination_id: int = None): @@ -162,9 +27,10 @@ def get_default_sampling_params(termination_id: int = None): top_p=0.0, return_log_probs=False, num_tokens_to_generate=30, - termination_id = termination_id, + termination_id=termination_id, ) + def get_curr_time() -> float: """Get synchronized time across ranks.""" curr_time = torch.cuda.LongTensor([time.time_ns()]) @@ -188,7 +54,13 @@ class Request: tokenizer (Any): Tokenizer for tokenizing the prompt. """ - def __init__(self, prompt_text: str, time_offset: float, tokenizer: Any, sampling_params: SamplingParams = None): + def __init__( + self, + prompt_text: str, + time_offset: float, + tokenizer: Any, + sampling_params: SamplingParams = None, + ): self.prompt_text = prompt_text self.prompt_tokens = tokenizer.tokenize(prompt_text) self.output_text = None @@ -197,8 +69,13 @@ def __init__(self, prompt_text: str, time_offset: float, tokenizer: Any, samplin self.time_arrival = None self.time_start = None self.time_end = None + self.ttft = None # Time-to-first-token in seconds self.state = "not-started" - self.sampling_params: SamplingParams = sampling_params if sampling_params is not None else get_default_sampling_params(tokenizer.eod) + self.sampling_params: SamplingParams = ( + sampling_params + if sampling_params is not None + else get_default_sampling_params(tokenizer.eod) + ) self.sampling_params = copy.deepcopy(self.sampling_params) def __str__(self) -> str: @@ -225,10 +102,10 @@ def get_time_offsets( # if num_requests is not None: incoming_requests_duration = num_requests / incoming_requests_per_sec - incoming_requests_duration *= 2 # extra margin, to accomodate time sampling + incoming_requests_duration *= 2 # extra margin, to accomodate time sampling random.seed(seed) - + import simpy # Guard against this import in test case # Generate random time offsets. @@ -241,14 +118,14 @@ def arrival(r): env = simpy.Environment() env.process(arrival(incoming_requests_per_sec)) env.run(incoming_requests_duration) - + # Ensure at least a single request. if len(time_offsets) == 0: time_offsets = [0.0] # Ensure first time is 0. time_offsets = [to - time_offsets[0] for to in time_offsets] - + # Truncate to num_requests. assert len(time_offsets) >= num_requests time_offsets = time_offsets[:num_requests] @@ -257,7 +134,7 @@ def arrival(r): def get_cli_requests( - args: Namespace, tokenizer: Any, sampling_params: Optional[SamplingParams] = None + args: Namespace, tokenizer: Any, sampling_params: Optional[SamplingParams] = None ) -> list[Request]: # Get time offsets. @@ -269,7 +146,7 @@ def get_cli_requests( ) # Init requests. - requests = [Request(p, t, tokenizer, sampling_params) for p,t in zip(args.prompts, t_offsets)] + requests = [Request(p, t, tokenizer, sampling_params) for p, t in zip(args.prompts, t_offsets)] return requests @@ -289,18 +166,14 @@ def get_synthetic_requests( # Build prompts with expected lengths. assert ( len(args.num_tokens_to_prompt) == 2 - and - args.num_tokens_to_prompt[1] >= args.num_tokens_to_prompt[0] + and args.num_tokens_to_prompt[1] >= args.num_tokens_to_prompt[0] ) max_prompt_length = args.num_tokens_to_prompt[1] max_prompt_text = "hi " * max_prompt_length max_prompt_tokens = tokenizer.tokenize(max_prompt_text) - prompt_lengths = [ - random.randint(*args.num_tokens_to_prompt) - for _ in time_offsets - ] - prompt_tokens_list = [ max_prompt_tokens[:l] for l in prompt_lengths ] - prompt_texts = [ tokenizer.detokenize(tt) for tt in prompt_tokens_list ] + prompt_lengths = [random.randint(*args.num_tokens_to_prompt) for _ in time_offsets] + prompt_tokens_list = [max_prompt_tokens[:l] for l in prompt_lengths] + prompt_texts = [tokenizer.detokenize(tt) for tt in prompt_tokens_list] # Init requests. assert len(prompt_texts) == len(time_offsets) @@ -340,16 +213,15 @@ def get_requests_from_file( # Get time offsets. time_offsets: list[float] = get_time_offsets( - args.seed, - args.incoming_requests_per_step, - args.incoming_requests_per_sec, - len(prompts), + args.seed, args.incoming_requests_per_step, args.incoming_requests_per_sec, len(prompts) ) # Init requests. requests = [ Request(p, t, tokenizer, sp) - for p, t, sp in tqdm(zip(prompts, time_offsets, sampling_params_list), "init requests", total=len(prompts)) + for p, t, sp in tqdm( + zip(prompts, time_offsets, sampling_params_list), "init requests", total=len(prompts) + ) ] return requests @@ -411,19 +283,21 @@ def build_dynamic_engine_setup_prefix( # Prompt description prompt_src_str = ( - "cli" if args.prompts else - "file" if args.prompt_file else - f"synth({', '.join(map(str, args.num_tokens_to_prompt))})" + "cli" + if args.prompts + else ( + "file" + if args.prompt_file + else f"synth({', '.join(map(str, args.num_tokens_to_prompt))})" + ) ) request_str = ( - f"requests: {prompt_src_str}, " - f"n {len(requests):d}, g {args.num_tokens_to_generate:d}, " + f"requests: {prompt_src_str}, " f"n {len(requests):d}, g {args.num_tokens_to_generate:d}, " ) request_str += ( - f"dur {args.incoming_requests_duration:.1e} " - f"r/sec {args.incoming_requests_per_sec:.1e}" - if args.incoming_requests_per_step is None else - f"r/step {args.incoming_requests_per_step}" + f"dur {args.incoming_requests_duration:.1e} " f"r/sec {args.incoming_requests_per_sec:.1e}" + if args.incoming_requests_per_step is None + else f"r/step {args.incoming_requests_per_step}" ) # Buffer limits config @@ -433,14 +307,7 @@ def build_dynamic_engine_setup_prefix( f"[r {context.max_requests}, t {context.max_tokens}]" ) - parts = [ - get_model_size_str(model), - "dynamic", - cg_str, - uvm_str, - request_str, - buffer_limits_str, - ] + parts = [get_model_size_str(model), "dynamic", cg_str, uvm_str, request_str, buffer_limits_str] return " | ".join(parts) @@ -456,4 +323,4 @@ def get_global_peak_memory_stats_bytes() -> dict: t = torch.tensor([peak_alloc], device="cuda", dtype=torch.int64) torch.distributed.all_reduce(t, op=torch.distributed.ReduceOp.MAX) peak_alloc = int(t[0].item()) - return {"mem-max-allocated-bytes": peak_alloc} \ No newline at end of file + return {"mem-max-allocated-bytes": peak_alloc} diff --git a/examples/inference/t5/simple_t5_batch_inference.py b/examples/inference/t5/simple_t5_batch_inference.py index 4b15952e07f..1aca74b3176 100644 --- a/examples/inference/t5/simple_t5_batch_inference.py +++ b/examples/inference/t5/simple_t5_batch_inference.py @@ -1,3 +1,5 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. + import os import sys from argparse import Namespace @@ -17,7 +19,7 @@ from megatron.core.inference.text_generation_controllers.encoder_decoder_text_generation_controller import ( EncoderDecoderTextGenerationController, ) -from megatron.core.tokenizers.text.utils.build_tokenizer import build_tokenizer +from megatron.core.tokenizers.utils.build_tokenizer import build_tokenizer from megatron.core.transformer.module import MegatronModule from pretrain_t5 import model_provider @@ -57,7 +59,7 @@ def add_text_generate_args(parser): metavar='N', type=str, nargs='+', - help='Encoder input prompts with each prompt within quotes and seperated by space', + help='Encoder input prompts with each prompt within quotes and separated by space', ) group.add_argument( "--max-batch-size", type=int, default=1, help='Max number of prompts to process at once' @@ -77,10 +79,8 @@ def get_inference_engine(args: Namespace, model: MegatronModule) -> AbstractEngi Returns: AbstractBackend: The chosen backend """ - if args.legacy_tokenizer: - tokenizer = get_tokenizer() - else: - tokenizer = build_tokenizer(args) + # Build tokenizer + tokenizer = build_tokenizer(args) inference_wrapper_config = InferenceWrapperConfig( hidden_size=args.hidden_size, @@ -131,10 +131,9 @@ def main(): num_tokens_to_generate=args.num_tokens_to_generate, ) - if args.legacy_tokenizer: - tokenizer = get_tokenizer() - else: - tokenizer = build_tokenizer(args) + # Build tokenizer + tokenizer = build_tokenizer(args) + decoder_prompts = [""] * len( args.encoder_prompts ) # for T5, the prompt is provided as encoder input, hence decoder_prompts is empty diff --git a/examples/llama/train_llama3_8b_h100_fp8.sh b/examples/llama/train_llama3_8b_h100_fp8.sh index f791996308e..28227546bc7 100644 --- a/examples/llama/train_llama3_8b_h100_fp8.sh +++ b/examples/llama/train_llama3_8b_h100_fp8.sh @@ -69,6 +69,7 @@ MODEL_ARGS=( --attention-dropout 0.0 --hidden-dropout 0.0 --swiglu + --normalization RMSNorm --init-method-std 0.0134 --attention-backend fused --apply-layernorm-1p diff --git a/examples/mimo/data/energon_avlm_task_encoder.py b/examples/mimo/data/energon_avlm_task_encoder.py index 32afb1b2cfb..a6a86761720 100644 --- a/examples/mimo/data/energon_avlm_task_encoder.py +++ b/examples/mimo/data/energon_avlm_task_encoder.py @@ -1,3 +1,5 @@ +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + import argparse import logging import os @@ -39,7 +41,6 @@ ) from megatron.energon.task_encoder.base import stateless from megatron.training import get_args -from megatron.training.tokenizer.multimodal_tokenizer import mistral_custom_template IMAGE_TOKEN = "" AUDIO_TOKEN = "