Bump tokenizers submodule to fix sentencepiece GCC 15 build (#20135)

rascani · claude · web-flow · commit 8e4fe08e3ce3 · 2026-06-09T15:08:54.000-07:00
### Summary

Updates extension/llm/tokenizers to include meta-pytorch/tokenizers#193, which bumps the sentencepiece submodule to pick up a missing `#include <cstdint>` (google/sentencepiece#1109). Without this, `pytorch_tokenizers` fails to compile inside the `executorch-ubuntu-26.04-gcc15` docker image, blocking the RISC-V baremetal CI (#19917).

### Test plan

CI

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/examples/models/parakeet/tokenizer_utils.cpp b/examples/models/parakeet/tokenizer_utils.cpp
@@ -8,6 +8,10 @@
 
 namespace {
 
+// SentencePiece's word-boundary marker (U+2581), spelled as UTF-8 bytes because
+// a u8"..." literal is const char8_t[] rather than const char[] in C++20.
+constexpr char kSentencePieceWordBoundary[] = "\xE2\x96\x81";
+
 bool is_whitespace_only(const std::string& token) {
   if (token.empty()) {
     return true;
@@ -36,7 +40,7 @@ bool is_special_token(const std::string& token) {
   if (token.rfind("##", 0) == 0) {
     return true;
   }
-  if (token.rfind(u8"▁", 0) == 0) {
+  if (token.rfind(kSentencePieceWordBoundary, 0) == 0) {
     return true;
   }
   if (is_whitespace_only(token)) {
diff --git a/extension/llm/tokenizers b/extension/llm/tokenizers
@@ -1 +1 @@
-Subproject commit b642403834a67c8ef14a7109dcd1bb5e5f3cb68a
+Subproject commit 3f98e9903e4e9972e5371522d1b64bc7793c250b