From 97d9e4b1e3a0e87f0c84c9ba11b856441f5e7d54 Mon Sep 17 00:00:00 2001 From: Amit Moryossef Date: Tue, 28 Apr 2026 09:18:43 +0000 Subject: [PATCH 1/2] Add Konrad et al. 2024 on Public DGS Corpus release 4 Co-Authored-By: Claude Opus 4.7 (1M context) --- src/index.md | 1 + src/references.bib | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/src/index.md b/src/index.md index cc117882..0685e71c 100644 --- a/src/index.md +++ b/src/index.md @@ -1142,6 +1142,7 @@ are collections of annotated single signs. They are synthesized [@dataset:ebling contain parallel sequences of signs and spoken language. Available continuous sign corpora are extremely limited, containing 4-6 orders of magnitude fewer sentence pairs than similar corpora for spoken language machine translation [@arivazhagan2019massively]. Moreover, while automatic speech recognition (ASR) datasets contain up to 50,000 hours of recordings [@pratap2020mls], the most extensive continuous sign language corpus contains only 1,150 hours, and only 50 of them are publicly available [@dataset:hanke-etal-2020-extending]. +@konrad-etal-2024-corpus describe the fourth release of the Public DGS Corpus, expanding it to 52.4 hours, adding a new iLex-based portal alongside MY DGS, MY DGS – annotated and MY DGS – ANNIS, and providing additional pose representations from MediaPipe and Apple Vision Framework, including 3D keypoint estimates. These datasets are usually synthesized [@dataset:databases2007volumes;@dataset:Crasborn2008TheCN;@dataset:ko2019neural;@dataset:hanke-etal-2020-extending] or recorded in studio conditions [@dataset:forster2014extensions;@cihan2018neural], which does not account for noise in real-life conditions. Moreover, some contain signed interpretations of spoken language rather than naturally-produced signs, which may not accurately represent native signing since translation is now a part of the discourse event. 
diff --git a/src/references.bib b/src/references.bib index f712af18..61364197 100644 --- a/src/references.bib +++ b/src/references.bib @@ -4171,6 +4171,7 @@ @inproceedings{petrovich2022TEMOSGeneratingDiverse year = {2022} } +<<<<<<< HEAD @inproceedings{bono-etal-2024-data, address = {Torino, Italia}, author = {Bono, Mayumi and @@ -4515,3 +4516,27 @@ @inproceedings{susman-kimmelman-2024-eye url = "https://aclanthology.org/2024.signlang-1.40/", pages = "361--369" } + +@inproceedings{konrad-etal-2024-corpus, + title = "Corpus {\`a} la carte {--} Improving Access to the {P}ublic {DGS} {C}orpus", + author = {Konrad, Reiner and + Hanke, Thomas and + Isard, Amy and + Schulder, Marc and + K{\"o}nig, Lutz and + Bleicken, Julian and + B{\"o}se, Oliver}, + editor = "Efthimiou, Eleni and + Fotinea, Stavroula-Evita and + Hanke, Thomas and + Hochgesang, Julie A. and + Mesch, Johanna and + Schulder, Marc", + booktitle = "Proceedings of the LREC-COLING 2024 11th Workshop on the Representation and Processing of Sign Languages: Evaluation of Sign Language Resources", + month = may, + year = "2024", + address = "Torino, Italia", + publisher = "ELRA and ICCL", + url = "https://aclanthology.org/2024.signlang-1.20/", + pages = "184--193" +} From ad8b34004d56615f354e6ba5dc9f438afbe90c26 Mon Sep 17 00:00:00 2001 From: AmitMY Date: Tue, 28 Apr 2026 09:55:01 +0000 Subject: [PATCH 2/2] Move konrad-etal entry to Dataset Papers (apply review pattern) --- src/index.md | 6 ++---- src/references.bib | 1 - 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/index.md b/src/index.md index 0685e71c..68020361 100644 --- a/src/index.md +++ b/src/index.md @@ -1104,9 +1104,6 @@ Some special features are cross-level links, non-temporal objects, timepoint tra 3D viewing of motion capture data and a project tool for managing whole corpora of annotation files. Anvil installation is [available](http://www.anvil-software.de/download/index.html) for Windows, macOS, and Linux. 
-##### MY DGS -- ANNIS Query Wizard -@isard-2024-building presented a web-based Query Wizard that guided users through the construction of valid ANNIS Query Language expressions over the Public DGS Corpus [@dataset:hanke-etal-2020-extending] by composing context-sensitive blocks for annotation tiers, metadata, and inter-tier connections, which could then be opened directly in the MY DGS -- ANNIS portal. - ##### Other {-} @battisti-etal-2024-advancing presented a transcription and annotation scheme for continuous L1 and L2 data in Swiss German Sign Language (DSGS), introducing conventions for non-manual components and L2 learner errors, and outlined an initial inter-annotator agreement validation approach. @@ -1124,6 +1121,8 @@ Research papers which do not necessarily contribute new theory or architectures @hall-etal-2024-phonological digitized and phonologically transcribed the Canadian Dictionary of ASL using the Sign Language Phonetic Annotator-Analyzer software, producing a searchable resource that captured handshape, movement, location, and relation parameters for roughly 2000 signs to enable phonologically based queries that paper-based dictionaries cannot support. +The Public DGS Corpus also saw multiple SignLang 2024 contributions: @konrad-etal-2024-corpus described its fourth release, expanding it to 52.4 hours, adding a new iLex-based portal alongside MY DGS, MY DGS – annotated, and MY DGS – ANNIS, and providing additional MediaPipe and Apple Vision Framework pose representations including 3D keypoint estimates; @isard-2024-building introduced a web-based Query Wizard that guided users through the construction of valid ANNIS Query Language expressions over the corpus by composing context-sensitive blocks for annotation tiers, metadata, and inter-tier connections. + @@ -1142,7 +1141,6 @@ are collections of annotated single signs. They are synthesized [@dataset:ebling contain parallel sequences of signs and spoken language. 
Available continuous sign corpora are extremely limited, containing 4-6 orders of magnitude fewer sentence pairs than similar corpora for spoken language machine translation [@arivazhagan2019massively]. Moreover, while automatic speech recognition (ASR) datasets contain up to 50,000 hours of recordings [@pratap2020mls], the most extensive continuous sign language corpus contains only 1,150 hours, and only 50 of them are publicly available [@dataset:hanke-etal-2020-extending]. -@konrad-etal-2024-corpus describe the fourth release of the Public DGS Corpus, expanding it to 52.4 hours, adding a new iLex-based portal alongside MY DGS, MY DGS – annotated and MY DGS – ANNIS, and providing additional pose representations from MediaPipe and Apple Vision Framework, including 3D keypoint estimates. These datasets are usually synthesized [@dataset:databases2007volumes;@dataset:Crasborn2008TheCN;@dataset:ko2019neural;@dataset:hanke-etal-2020-extending] or recorded in studio conditions [@dataset:forster2014extensions;@cihan2018neural], which does not account for noise in real-life conditions. Moreover, some contain signed interpretations of spoken language rather than naturally-produced signs, which may not accurately represent native signing since translation is now a part of the discourse event. diff --git a/src/references.bib b/src/references.bib index 61364197..55752ea7 100644 --- a/src/references.bib +++ b/src/references.bib @@ -4171,7 +4171,6 @@ @inproceedings{petrovich2022TEMOSGeneratingDiverse year = {2022} } -<<<<<<< HEAD @inproceedings{bono-etal-2024-data, address = {Torino, Italia}, author = {Bono, Mayumi and