From 0d018d5bbb50e81cb5e5991ab2c5b15c51d3b6d1 Mon Sep 17 00:00:00 2001 From: Amit Moryossef Date: Tue, 28 Apr 2026 09:20:49 +0000 Subject: [PATCH 1/2] Add Roh et al. 2024 on MediaPipe keypoint preprocessing for ISLR Co-Authored-By: Claude Opus 4.7 (1M context) --- Review note: the src/references.bib hunk now appends the complete roh-etal-2024-preprocessing entry after the closing brace of zhou-etal-2024-multimodal. The previous hunks spliced it into the middle of that entry, which truncated zhou after its author field, gave roh the url/pages of zhou, and left the real roh url/pages outside any entry. The editor, booktitle, month, year, address and publisher lines are assumed to match the sibling signlang-2024 entry - confirm against the file. The post-image blob id on the references.bib index line was not regenerated. src/index.md | 2 ++ src/references.bib | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/src/index.md b/src/index.md index 0fc1258e..be779b94 100644 --- a/src/index.md +++ b/src/index.md @@ -569,6 +569,8 @@ they combine the phonological annotations in ASL-LEX 2.0 [@dataset:sehyr2021asl] Interestingly, @tavella-etal-2022-wlasl construct a similar dataset aiming just for phonological property recognition in American Sign Language (ASL). @inoue-etal-2024-enhancing trained a Video-Keypoint Network (VKNet) to classify the location, movement, and handshape syllabic components of the dominant hand in Japanese Sign Language (JSL), and demonstrated that pre-training on the WLASL American Sign Language (ASL) dataset improved classification of the movement and handshape components when JSL training data was limited. +@roh-etal-2024-preprocessing focus on preprocessing MediaPipe keypoints for isolated SLR by introducing palm-anchor-based normalization to emphasize hand shape and bilinear interpolation to reconstruct undetected hand keypoints, achieving 83.26% accuracy on WLASL-100 with a Transformer encoder, the highest among pose-based approaches at the time. + #### Gloss-to-Pose Gloss-to-Pose, subsumed under the task of sign language production, is the task of producing a sequence of poses that adequately represent a sequence of signs written as gloss. 
diff --git a/src/references.bib b/src/references.bib index f56f0d02..5d2b8280 100644 --- a/src/references.bib +++ b/src/references.bib @@ -4559,3 +4559,25 @@ @inproceedings{zhou-etal-2024-multimodal url = "https://aclanthology.org/2024.signlang-1.45/", pages = "408--419" } + +@inproceedings{roh-etal-2024-preprocessing, + title = "Preprocessing Mediapipe Keypoints with Keypoint Reconstruction and Anchors for Isolated Sign Language Recognition", + author = "Roh, Kyunggeun and + Lee, Huije and + Hwang, Eui Jun and + Cho, Sukmin and + Park, Jong C.", + editor = "Efthimiou, Eleni and + Fotinea, Stavroula-Evita and + Hanke, Thomas and + Hochgesang, Julie A. and + Mesch, Johanna and + Schulder, Marc", + booktitle = "Proceedings of the LREC-COLING 2024 11th Workshop on the Representation and Processing of Sign Languages: Evaluation of Sign Language Resources", + month = may, + year = "2024", + address = "Torino, Italia", + publisher = "ELRA and ICCL", + url = "https://aclanthology.org/2024.signlang-1.36/", + pages = "323--334" +} From 1f1327a652fe3614846de38400b67358a92659a1 Mon Sep 17 00:00:00 2001 From: AmitMY Date: Tue, 28 Apr 2026 09:55:15 +0000 Subject: [PATCH 2/2] Drop accuracy number from roh-etal one-liner (review pattern) --- src/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/index.md b/src/index.md index be779b94..7835f8db 100644 --- a/src/index.md +++ b/src/index.md @@ -569,7 +569,7 @@ they combine the phonological annotations in ASL-LEX 2.0 [@dataset:sehyr2021asl] Interestingly, @tavella-etal-2022-wlasl construct a similar dataset aiming just for phonological property recognition in American Sign Language (ASL). @inoue-etal-2024-enhancing trained a Video-Keypoint Network (VKNet) to classify the location, movement, and handshape syllabic components of the dominant hand in Japanese Sign Language (JSL), and demonstrated that pre-training on the WLASL American Sign Language (ASL) dataset improved classification of the movement and handshape components when JSL training data was limited. 
-@roh-etal-2024-preprocessing focus on preprocessing MediaPipe keypoints for isolated SLR by introducing palm-anchor-based normalization to emphasize hand shape and bilinear interpolation to reconstruct undetected hand keypoints, achieving 83.26% accuracy on WLASL-100 with a Transformer encoder, the highest among pose-based approaches at the time. +@roh-etal-2024-preprocessing focus on preprocessing MediaPipe keypoints for isolated SLR by introducing palm-anchor-based normalization to emphasize hand shape and bilinear interpolation to reconstruct undetected hand keypoints, achieving the highest accuracy on WLASL-100 with a Transformer encoder among pose-based approaches at the time. #### Gloss-to-Pose Gloss-to-Pose, subsumed under the task of sign language production, is the task of producing a sequence of poses that adequately represent