From 0b14c77699d1ca723b806bd0c6c58eb4688d188c Mon Sep 17 00:00:00 2001 From: Wadim Wawrzenczak Date: Tue, 17 Mar 2026 01:36:39 +0100 Subject: [PATCH 01/13] Update PO escaping tests to expect unescaped names Text unit names generated from msgid should be fully unescaped, matching the source. --- .../extraction/POFilterExtractionTest.java | 16 ++++++++-------- .../tm/localizeasset/LocalizePoAssetTest.java | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/common/src/test/java/com/box/l10n/mojito/okapi/filters/extraction/POFilterExtractionTest.java b/common/src/test/java/com/box/l10n/mojito/okapi/filters/extraction/POFilterExtractionTest.java index 31d826486e..b0736fcd5a 100644 --- a/common/src/test/java/com/box/l10n/mojito/okapi/filters/extraction/POFilterExtractionTest.java +++ b/common/src/test/java/com/box/l10n/mojito/okapi/filters/extraction/POFilterExtractionTest.java @@ -64,7 +64,7 @@ public void extractWithEscapedDoubleQuotes() throws UnsupportedAssetFilterTypeEx Assertions.assertThat(units) .extracting(AssetExtractorTextUnit::getName, AssetExtractorTextUnit::getSource) - .containsExactly(tuple("He said \\\"hello\\\"", "He said \"hello\"")); + .containsExactly(tuple("He said \"hello\"", "He said \"hello\"")); } @Test @@ -81,7 +81,7 @@ public void extractWithEscapedSingleQuotes() throws UnsupportedAssetFilterTypeEx Assertions.assertThat(units) .extracting(AssetExtractorTextUnit::getName, AssetExtractorTextUnit::getSource) - .containsExactly(tuple("it\\'s a test", "it's a test")); + .containsExactly(tuple("it's a test", "it's a test")); } @Test @@ -157,11 +157,11 @@ public void extractPluralsWithEscapedQuotes() throws UnsupportedAssetFilterTypeE AssetExtractorTextUnit::getPluralForm, AssetExtractorTextUnit::getPluralFormOther) .containsExactly( - tuple("the \\\"item\\\" _zero", "the \"items\"", "zero", "the \\\"item\\\" _other"), - tuple("the \\\"item\\\" _one", "the \"item\"", "one", "the \\\"item\\\" _other"), - tuple("the \\\"item\\\" _two", "the \"items\"", "two", "the \\\"item\\\" _other"), - tuple("the \\\"item\\\" _few", "the \"items\"", "few", "the \\\"item\\\" _other"), - tuple("the \\\"item\\\" _many", "the \"items\"", "many", "the \\\"item\\\" _other"), - tuple("the \\\"item\\\" _other", "the \"items\"", "other", "the \\\"item\\\" _other")); + tuple("the \"item\" _zero", "the \"items\"", "zero", "the \"item\" _other"), + tuple("the \"item\" _one", "the \"item\"", "one", "the \"item\" _other"), + tuple("the \"item\" _two", "the \"items\"", "two", "the \"item\" _other"), + tuple("the \"item\" _few", "the \"items\"", "few", "the \"item\" _other"), + tuple("the \"item\" _many", "the \"items\"", "many", "the \"item\" _other"), + tuple("the \"item\" _other", "the \"items\"", "other", "the \"item\" _other")); } } diff --git a/webapp/src/test/java/com/box/l10n/mojito/service/tm/localizeasset/LocalizePoAssetTest.java b/webapp/src/test/java/com/box/l10n/mojito/service/tm/localizeasset/LocalizePoAssetTest.java index b1d584dfeb..6771ee866f 100644 --- a/webapp/src/test/java/com/box/l10n/mojito/service/tm/localizeasset/LocalizePoAssetTest.java +++ b/webapp/src/test/java/com/box/l10n/mojito/service/tm/localizeasset/LocalizePoAssetTest.java @@ -246,7 +246,7 @@ public void testLocalizePoEscaping() throws Exception { assertEquals(1, textUnitDTOs.size()); TextUnitDTO textUnitDTO = textUnitDTOs.get(0); - assertEquals("repin \\\"{}\\\"", textUnitDTO.getName()); + assertEquals("repin \"{}\"", textUnitDTO.getName()); assertEquals("repin \"{}\"", textUnitDTO.getSource()); String localizedAsset = generateLocalized(assetContent, repoLocale, "ja-JP"); @@ -286,7 +286,7 @@ public void testLocalizePoEscaping() throws Exception { assertEquals(1, textUnitDTOs.size()); textUnitDTO = textUnitDTOs.get(0); - assertEquals("repin \\\"{}\\\"", textUnitDTO.getName()); + assertEquals("repin \"{}\"", textUnitDTO.getName()); assertEquals("repin \"{}\"", textUnitDTO.getSource()); assertEquals("repin \"{}\" jp", textUnitDTO.getTarget()); } From 232bf01b2c412a7e632df6aee9a2e937886e7d78 Mon Sep 17 00:00:00 2001 From: Wadim Wawrzenczak Date: Tue, 17 Mar 2026 11:16:29 +0100 Subject: [PATCH 02/13] Add PO escaping tests ported from Okapi Cover escape/unescape test cases handled by Okapi PO implementation to ensure that Mojito's overridden class behaves consistently. --- .../extraction/POFilterExtractionTest.java | 115 ++++++++++++ .../tm/localizeasset/LocalizePoAssetTest.java | 177 ++++++++++++++++++ 2 files changed, 292 insertions(+) diff --git a/common/src/test/java/com/box/l10n/mojito/okapi/filters/extraction/POFilterExtractionTest.java b/common/src/test/java/com/box/l10n/mojito/okapi/filters/extraction/POFilterExtractionTest.java index b0736fcd5a..e97588d3f8 100644 --- a/common/src/test/java/com/box/l10n/mojito/okapi/filters/extraction/POFilterExtractionTest.java +++ b/common/src/test/java/com/box/l10n/mojito/okapi/filters/extraction/POFilterExtractionTest.java @@ -164,4 +164,119 @@ public void extractPluralsWithEscapedQuotes() throws UnsupportedAssetFilterTypeE tuple("the \"item\" _many", "the \"items\"", "many", "the \"item\" _other"), tuple("the \"item\" _other", "the \"items\"", "other", "the \"item\" _other")); } + + @Test + public void extractWithEscapedBackslashQuoteNewlineTab() + throws UnsupportedAssetFilterTypeException { + // based on: + // https://gitlab.com/okapiframework/Okapi/-/blob/v1.45.0/okapi/filters/po/src/test/java/net/sf/okapi/filters/po/POFilterTest.java#L241-251 + List units = + extract( + "test.pot", + """ + msgid "Text \\\\ and \\" and \\n and \\t" + msgstr "" + """, + null, + null); + + Assertions.assertThat(units) + .extracting(AssetExtractorTextUnit::getName, AssetExtractorTextUnit::getSource) + .containsExactly(tuple("Text \\ and \" and \n and \t", "Text \\ and \" and \n and \t")); + } + + @Test + public void extractWithEscapedFormFeedBackspaceCarriageReturn() + throws UnsupportedAssetFilterTypeException { + // based on: + // https://gitlab.com/okapiframework/Okapi/-/blob/v1.45.0/okapi/filters/po/src/test/java/net/sf/okapi/filters/po/POFilterTest.java#L253-263 + List units = + extract( + "test.pot", + """ + msgid "Text \\f and \\b and \\rr" + msgstr "" + """, + null, + null); + + Assertions.assertThat(units) + .extracting(AssetExtractorTextUnit::getName, AssetExtractorTextUnit::getSource) + .containsExactly(tuple("Text \f and \b and \rr", "Text \f and \b and \rr")); + } + + @Test + public void extractWithEscapeAtStartOfString() throws UnsupportedAssetFilterTypeException { + // based on: + // https://gitlab.com/okapiframework/Okapi/-/blob/v1.45.0/okapi/filters/po/src/test/java/net/sf/okapi/filters/po/POFilterTest.java#L265-276 + List units = + extract( + "test.pot", + """ + msgid "\\t." + msgstr "" + """, + null, + null); + + Assertions.assertThat(units) + .extracting(AssetExtractorTextUnit::getName, AssetExtractorTextUnit::getSource) + .containsExactly(tuple("\t.", "\t.")); + } + + @Test + public void extractWithUnescapedQuoteInMiddle() throws UnsupportedAssetFilterTypeException { + // based on: + // https://gitlab.com/okapiframework/Okapi/-/blob/v1.45.0/okapi/filters/po/src/test/java/net/sf/okapi/filters/po/POFilterTest.java#L280-290 + + // Verifies parity with how Okapi handles malformed escaping in PO files + + // Malformed PO: unescaped quote in middle of msgid string. + // PO Parser uses lastIndexOf('"') to find the closing delimiter, + // so given a PO snippet: + // `msgid "A " and a \"` + // the parsed string content after unescaping is: + // `A " and a \` + // (both an unescaped inner quote and the single trailing backslash are treated as literals) + List units = + extract( + "test.pot", + """ + msgid "A " and a \\" + msgstr "" + """, + null, + null); + + Assertions.assertThat(units) + .extracting(AssetExtractorTextUnit::getName, AssetExtractorTextUnit::getSource) + .containsExactly(tuple("A \" and a \\", "A \" and a \\")); + } + + @Test + public void extractWithEscapedAndDanglingBackslash() throws UnsupportedAssetFilterTypeException { + // based on: + // https://gitlab.com/okapiframework/Okapi/-/blob/v1.45.0/okapi/filters/po/src/test/java/net/sf/okapi/filters/po/POFilterTest.java#L292-303 + + // Verifies parity with how Okapi handles malformed escaping in PO files + + // PO snippet + // `msgid "\\\"` + // has 3 backslashes between quotes: escaped backslash (\\) + // followed by a dangling backslash (\) before closing quote. + List units = + extract( + "test.pot", + """ + msgid "\\\\\\" + msgstr "" + """, + null, + null); + + // After unescape: \\ → \ (one backslash), dangling \ stays → total 2 backslashes. + Assertions.assertThat(units) + .extracting(AssetExtractorTextUnit::getName, AssetExtractorTextUnit::getSource) + .containsExactly(tuple("\\\\", "\\\\")); + } } diff --git a/webapp/src/test/java/com/box/l10n/mojito/service/tm/localizeasset/LocalizePoAssetTest.java b/webapp/src/test/java/com/box/l10n/mojito/service/tm/localizeasset/LocalizePoAssetTest.java index 6771ee866f..9a8de43320 100644 --- a/webapp/src/test/java/com/box/l10n/mojito/service/tm/localizeasset/LocalizePoAssetTest.java +++ b/webapp/src/test/java/com/box/l10n/mojito/service/tm/localizeasset/LocalizePoAssetTest.java @@ -668,4 +668,181 @@ public void testImportLocalizedAssetPoPluralSinglePlural() throws Exception { """; importTranslations(repoLocale, localizedAssetContent, StatusForEqualTarget.APPROVED); } + + @Test + public void testLocalizePoEscapedBackslashNewlineTab() throws Exception { + // based on: + // https://gitlab.com/okapiframework/Okapi/-/blob/v1.45.0/okapi/filters/po/src/test/java/net/sf/okapi/filters/po/POFilterTest.java#L241-251 + + Repository repo = createRepository(); + RepositoryLocale repoLocale = addLocale(repo, "ja-JP"); + + String assetContent = + """ + msgid "" + msgstr "" + "Project-Id-Version: PACKAGE VERSION\\n" + "Report-Msgid-Bugs-To: \\n" + "POT-Creation-Date: 2017-09-15 11:53-0500\\n" + "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n" + "Last-Translator: FULL NAME \\n" + "Language-Team: LANGUAGE \\n" + "MIME-Version: 1.0\\n" + "Plural-Forms: nplurals=2; plural=(n != 1);\\n" + "Content-Type: text/plain; charset=utf-8\\n" + "Content-Transfer-Encoding: 8bit\\n" + #. Comments + #: src/main.py:10 + msgid "Text \\\\ and \\" and \\n and \\t" + msgstr "" + """; + + String expectedLocalizedAsset = + """ + msgid "" + msgstr "" + "Project-Id-Version: PACKAGE VERSION\\n" + "Report-Msgid-Bugs-To: \\n" + "POT-Creation-Date: 2017-09-15 11:53-0500\\n" + "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n" + "Last-Translator: FULL NAME \\n" + "Language-Team: LANGUAGE \\n" + "MIME-Version: 1.0\\n" + "Plural-Forms: nplurals=1; plural=0;\\n" + "Content-Type: text/plain; charset=utf-8\\n" + "Content-Transfer-Encoding: 8bit\\n" + #. Comments + #: src/main.py:10 + msgid "Text \\\\ and \\" and \\n and \\t" + msgstr "Text \\\\ and \\" and \\n and \\t" + """; + + createAsset(repo, "messages.pot", assetContent); + processAsset(repo, assetContent); + + List textUnitDTOs = searchTextUnits(repo); + + assertEquals(1, textUnitDTOs.size()); + TextUnitDTO textUnitDTO = textUnitDTOs.get(0); + assertEquals("Text \\ and \" and \n and \t", textUnitDTO.getName()); + assertEquals("Text \\ and \" and \n and \t", textUnitDTO.getSource()); + + String localizedAsset = generateLocalized(assetContent, repoLocale, "ja-JP"); + logger.debug("localized=\n{}", localizedAsset); + assertEquals(expectedLocalizedAsset, localizedAsset); + + String forImport = + """ + msgid "" + msgstr "" + "Project-Id-Version: PACKAGE VERSION\\n" + "Report-Msgid-Bugs-To: \\n" + "POT-Creation-Date: 2017-09-15 11:53-0500\\n" + "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n" + "Last-Translator: FULL NAME \\n" + "Language-Team: LANGUAGE \\n" + "MIME-Version: 1.0\\n" + "Plural-Forms: nplurals=1; plural=0;\\n" + "Content-Type: text/plain; charset=utf-8\\n" + "Content-Transfer-Encoding: 8bit\\n" + #. Comments + #: src/main.py:10 + msgid "Text \\\\ and \\" and \\n and \\t" + msgstr "Text \\\\ and \\" and \\n and \\t jp" + """; + + importTranslations(repoLocale, forImport, StatusForEqualTarget.TRANSLATION_NEEDED); + + localizedAsset = generateLocalized(assetContent, repoLocale, "ja-JP"); + logger.debug("localized after import=\n{}", localizedAsset); + assertEquals(forImport, localizedAsset); + + TextUnitSearcherParameters textUnitSearcherParameters = new TextUnitSearcherParameters(); + textUnitSearcherParameters.setRepositoryIds(repo.getId()); + textUnitSearcherParameters.setStatusFilter(StatusFilter.TRANSLATED); + textUnitSearcherParameters.setLocaleId(repoLocale.getLocale().getId()); + textUnitDTOs = textUnitSearcher.search(textUnitSearcherParameters); + + assertEquals(1, textUnitDTOs.size()); + textUnitDTO = textUnitDTOs.get(0); + assertEquals("Text \\ and \" and \n and \t", textUnitDTO.getName()); + assertEquals("Text \\ and \" and \n and \t", textUnitDTO.getSource()); + assertEquals("Text \\ and \" and \n and \t jp", textUnitDTO.getTarget()); + } + + @Test + public void testLocalizePoUnescapedRewrite() throws Exception { + // based on: + // https://gitlab.com/okapiframework/Okapi/-/blob/v1.45.0/okapi/filters/po/src/test/java/net/sf/okapi/filters/po/POFilterTest.java#L308-312 + + // Verifies parity with how Okapi handles malformed escaping in PO files + + Repository repo = createRepository(); + RepositoryLocale repoLocale = addLocale(repo, "ja-JP"); + + // Malformed PO: unescaped quote in middle of msgid string. + // PO Parser uses lastIndexOf('"') to find the closing delimiter, + // so given a PO snippet: + // `msgid "A " and a \"` + // the parsed string content after unescaping is: + // `A " and a \` + // (both an unescaped inner quote and the single trailing backslash are treated as literals) + String assetContent = + """ + msgid "" + msgstr "" + "Project-Id-Version: PACKAGE VERSION\\n" + "Report-Msgid-Bugs-To: \\n" + "POT-Creation-Date: 2017-09-15 11:53-0500\\n" + "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n" + "Last-Translator: FULL NAME \\n" + "Language-Team: LANGUAGE \\n" + "MIME-Version: 1.0\\n" + "Plural-Forms: nplurals=2; plural=(n != 1);\\n" + "Content-Type: text/plain; charset=utf-8\\n" + "Content-Transfer-Encoding: 8bit\\n" + msgid "A " and a \\" + msgstr "" + """; + + // On localization (source copied to target), encoder should properly re-escape: + // `msgstr "A \" and a \\" + String expectedLocalizedAsset = + """ + msgid "" + msgstr "" + "Project-Id-Version: PACKAGE VERSION\\n" + "Report-Msgid-Bugs-To: \\n" + "POT-Creation-Date: 2017-09-15 11:53-0500\\n" + "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n" + "Last-Translator: FULL NAME \\n" + "Language-Team: LANGUAGE \\n" + "MIME-Version: 1.0\\n" + "Plural-Forms: nplurals=1; plural=0;\\n" + "Content-Type: text/plain; charset=utf-8\\n" + "Content-Transfer-Encoding: 8bit\\n" + msgid "A " and a \\" + msgstr "A \\" and a \\\\" + """; + + createAsset(repo, "messages.pot", assetContent); + processAsset(repo, assetContent); + + List textUnitDTOs = searchTextUnits(repo); + + assertEquals(1, textUnitDTOs.size()); + TextUnitDTO textUnitDTO = textUnitDTOs.get(0); + assertEquals( + """ + A " and a \\""", + textUnitDTO.getName()); + assertEquals( + """ + A " and a \\""", + textUnitDTO.getSource()); + + String localizedAsset = generateLocalized(assetContent, repoLocale, "ja-JP"); + logger.debug("localized=\n{}", localizedAsset); + assertEquals(expectedLocalizedAsset, localizedAsset); + } } From 426f4b98cd204d39610f1399d4004d20db4ec223 Mon Sep 17 00:00:00 2001 From: Wadim Wawrzenczak Date: Tue, 17 Mar 2026 11:23:12 +0100 Subject: [PATCH 03/13] Add tests for unescaping roundtrip asymmetry in PO Source strings from PO files would be doubly unescaped in the extraction, while target strings would not be escaped at all when generating localized assets. --- .../extraction/POFilterExtractionTest.java | 33 +++++ .../tm/localizeasset/LocalizePoAssetTest.java | 123 ++++++++++++++++++ 2 files changed, 156 insertions(+) diff --git a/common/src/test/java/com/box/l10n/mojito/okapi/filters/extraction/POFilterExtractionTest.java b/common/src/test/java/com/box/l10n/mojito/okapi/filters/extraction/POFilterExtractionTest.java index e97588d3f8..1f7cee94c0 100644 --- a/common/src/test/java/com/box/l10n/mojito/okapi/filters/extraction/POFilterExtractionTest.java +++ b/common/src/test/java/com/box/l10n/mojito/okapi/filters/extraction/POFilterExtractionTest.java @@ -279,4 +279,37 @@ public void extractWithEscapedAndDanglingBackslash() throws UnsupportedAssetFilt .extracting(AssetExtractorTextUnit::getName, AssetExtractorTextUnit::getSource) .containsExactly(tuple("\\\\", "\\\\")); } + + @Test + public void extractWithNoDoubleUnescaping() throws UnsupportedAssetFilterTypeException { + List units = + extract( + "test.pot", + // Actual source string in the localized application UI is: + // `Name cannot contain "/", "\", or characters outside the basic multilingual plane.` + // In PO file content, each double quote and backslash is escaped with another + // backslash. + // `msgid "Name cannot contain \"/\", \"\\\", or characters outside the basic + // multilingual plane."` + // Finally, in Java text block we must escape each backslash additionally + """ + msgid "Name cannot contain \\"/\\", \\"\\\\\\", or characters outside the basic multilingual plane." + msgstr "" + """, + null, + null); + + Assertions.assertThat(units) + .extracting(AssetExtractorTextUnit::getName, AssetExtractorTextUnit::getSource) + .containsExactly( + // Mojito should unescape correctly matching the actual UI string + // `Name cannot contain "/", "\", or characters outside the basic multilingual plane.` + tuple( + """ + Name cannot contain "/", "\\", or characters outside the basic multilingual plane.\ + """, + """ + Name cannot contain "/", "\\", or characters outside the basic multilingual plane.\ + """)); + } } diff --git a/webapp/src/test/java/com/box/l10n/mojito/service/tm/localizeasset/LocalizePoAssetTest.java b/webapp/src/test/java/com/box/l10n/mojito/service/tm/localizeasset/LocalizePoAssetTest.java index 9a8de43320..beef4702c5 100644 --- a/webapp/src/test/java/com/box/l10n/mojito/service/tm/localizeasset/LocalizePoAssetTest.java +++ b/webapp/src/test/java/com/box/l10n/mojito/service/tm/localizeasset/LocalizePoAssetTest.java @@ -845,4 +845,127 @@ public void testLocalizePoUnescapedRewrite() throws Exception { logger.debug("localized=\n{}", localizedAsset); assertEquals(expectedLocalizedAsset, localizedAsset); } + + @Test + public void testLocalizePoEscapingSymmetry() throws Exception { + + Repository repo = createRepository(); + RepositoryLocale repoLocale = addLocale(repo, "ja-JP"); + + // Actual source string in the localized application UI is: + // `Name cannot contain "/", "\", or characters outside the basic multilingual plane.` + // `msgid "Name cannot contain \"/\", \"\\\", or characters outside the basic multilingual + // plane."` + // Finally, in Java text block we must escape each backslash additionally + + String assetContent = + """ + msgid "" + msgstr "" + "Project-Id-Version: PACKAGE VERSION\\n" + "Report-Msgid-Bugs-To: \\n" + "POT-Creation-Date: 2017-09-15 11:53-0500\\n" + "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n" + "Last-Translator: FULL NAME \\n" + "Language-Team: LANGUAGE \\n" + "MIME-Version: 1.0\\n" + "Plural-Forms: nplurals=2; plural=(n != 1);\\n" + "Content-Type: text/plain; charset=utf-8\\n" + "Content-Transfer-Encoding: 8bit\\n" + #: src/main.py:10 + msgid "Name cannot contain \\"/\\", \\"\\\\\\", or characters outside the basic multilingual plane." + msgstr "" + """; + + // Target string backfilled from source must match the source exactly, + // i.e. the unescaping when parsing and re-escaping when generating a localized asset + // must be symmetric + String expectedLocalizedAsset = + """ + msgid "" + msgstr "" + "Project-Id-Version: PACKAGE VERSION\\n" + "Report-Msgid-Bugs-To: \\n" + "POT-Creation-Date: 2017-09-15 11:53-0500\\n" + "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n" + "Last-Translator: FULL NAME \\n" + "Language-Team: LANGUAGE \\n" + "MIME-Version: 1.0\\n" + "Plural-Forms: nplurals=1; plural=0;\\n" + "Content-Type: text/plain; charset=utf-8\\n" + "Content-Transfer-Encoding: 8bit\\n" + #: src/main.py:10 + msgid "Name cannot contain \\"/\\", \\"\\\\\\", or characters outside the basic multilingual plane." + msgstr "Name cannot contain \\"/\\", \\"\\\\\\", or characters outside the basic multilingual plane." + """; + + createAsset(repo, "messages.pot", assetContent); + processAsset(repo, assetContent); + + List textUnitDTOs = searchTextUnits(repo); + + assertEquals(1, textUnitDTOs.size()); + TextUnitDTO textUnitDTO = textUnitDTOs.get(0); + assertEquals( + """ + Name cannot contain "/", "\\", or characters outside the basic multilingual plane.""", + textUnitDTO.getName()); + assertEquals( + """ + Name cannot contain "/", "\\", or characters outside the basic multilingual plane.""", + textUnitDTO.getSource()); + + // Verify source is copied to target with proper re-escaping + String localizedAsset = generateLocalized(assetContent, repoLocale, "ja-JP"); + logger.debug("localized=\n{}", localizedAsset); + assertEquals(expectedLocalizedAsset, localizedAsset); + + // Import a translation where the target has a literal backslash. + // `Nazwa nie może zawierać „/”, „\”, ani znaków spoza podstawowej klawiatury wielojęzycznej.` + // (Note that in Polish translation those are different quote characters which don't need + // escaping) + // Translator should be able to type a literal target string in Mojito UI + // which should then be correctly escaped when generating a localized asset + // In PO file content, the backslash is escaped with another backslash: + // `msgstr "Nazwa nie może zawierać „/”, „\\\\”, ani znaków spoza podstawowej klawiatury + // wielojęzycznej."` + String forImport = + """ + msgid "" + msgstr "" + "Project-Id-Version: PACKAGE VERSION\\n" + "Report-Msgid-Bugs-To: \\n" + "POT-Creation-Date: 2017-09-15 11:53-0500\\n" + "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n" + "Last-Translator: FULL NAME \\n" + "Language-Team: LANGUAGE \\n" + "MIME-Version: 1.0\\n" + "Plural-Forms: nplurals=1; plural=0;\\n" + "Content-Type: text/plain; charset=utf-8\\n" + "Content-Transfer-Encoding: 8bit\\n" + #: src/main.py:10 + msgid "Name cannot contain \\"/\\", \\"\\\\\\", or characters outside the basic multilingual plane." + msgstr "Nazwa nie może zawierać „/”, „\\\\”, ani znaków spoza podstawowej klawiatury wielojęzycznej." + """; + + importTranslations(repoLocale, forImport, StatusForEqualTarget.TRANSLATION_NEEDED); + + TextUnitSearcherParameters textUnitSearcherParameters = new TextUnitSearcherParameters(); + textUnitSearcherParameters.setRepositoryIds(repo.getId()); + textUnitSearcherParameters.setStatusFilter(StatusFilter.TRANSLATED); + textUnitSearcherParameters.setLocaleId(repoLocale.getLocale().getId()); + textUnitDTOs = textUnitSearcher.search(textUnitSearcherParameters); + + assertEquals(1, textUnitDTOs.size()); + textUnitDTO = textUnitDTOs.get(0); + // TM should store the unescaped target (single backslash) + assertEquals( + "Nazwa nie może zawierać „/”, „\\”, ani znaków spoza podstawowej klawiatury wielojęzycznej.", + textUnitDTO.getTarget()); + + // Re-generate localized PO — encoder should re-escape \ to \\ + localizedAsset = generateLocalized(assetContent, repoLocale, "ja-JP"); + logger.debug("localized after import=\n{}", localizedAsset); + assertEquals(forImport, localizedAsset); + } } From 84c022bdef11efa6d1c5d6d113cc38d53b0f4bc0 Mon Sep 17 00:00:00 2001 From: Wadim Wawrzenczak Date: Wed, 18 Mar 2026 02:24:41 +0100 Subject: [PATCH 04/13] Remove reflection tests --- .../l10n/mojito/okapi/filters/POFilterTest.java | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/common/src/test/java/com/box/l10n/mojito/okapi/filters/POFilterTest.java b/common/src/test/java/com/box/l10n/mojito/okapi/filters/POFilterTest.java index c72a1f13d5..e98ebeca0a 100644 --- a/common/src/test/java/com/box/l10n/mojito/okapi/filters/POFilterTest.java +++ b/common/src/test/java/com/box/l10n/mojito/okapi/filters/POFilterTest.java @@ -1,7 +1,6 @@ package com.box.l10n.mojito.okapi.filters; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNull; import java.util.ArrayList; import java.util.List; @@ -13,20 +12,6 @@ */ public class POFilterTest { - @Test - public void loadMsgIDPluralFromParent() { - POFilter poFilter = new POFilter(); - poFilter.loadMsgIDPluralFromParent(); - assertNull(poFilter.msgIDPlural); - } - - @Test - public void loadMsgIDFromParent() { - POFilter poFilter = new POFilter(); - poFilter.loadMsgIDFromParent(); - assertNull(poFilter.msgID); - } - @Test public void getUsagesFromSkeleton() { String skeleton = From 88cee79c7860a7c18c6e31e221a33edfba311336 Mon Sep 17 00:00:00 2001 From: Wadim Wawrzenczak Date: Wed, 18 Mar 2026 17:05:20 +0100 Subject: [PATCH 05/13] Update test for single quote escape to match Okapi behavior --- .../okapi/filters/extraction/POFilterExtractionTest.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/common/src/test/java/com/box/l10n/mojito/okapi/filters/extraction/POFilterExtractionTest.java b/common/src/test/java/com/box/l10n/mojito/okapi/filters/extraction/POFilterExtractionTest.java index 1f7cee94c0..dc34e3f2e5 100644 --- a/common/src/test/java/com/box/l10n/mojito/okapi/filters/extraction/POFilterExtractionTest.java +++ b/common/src/test/java/com/box/l10n/mojito/okapi/filters/extraction/POFilterExtractionTest.java @@ -68,7 +68,9 @@ public void extractWithEscapedDoubleQuotes() throws UnsupportedAssetFilterTypeEx } @Test - public void extractWithEscapedSingleQuotes() throws UnsupportedAssetFilterTypeException { + public void extractWithBackslashAndSingleQuote() throws UnsupportedAssetFilterTypeException { + // \' is not a standard PO escape sequence (PO uses double-quote delimiters), + // so the backslash is preserved as a literal character by Okapi's parser. List units = extract( "test.pot", @@ -81,7 +83,7 @@ public void extractWithEscapedSingleQuotes() throws UnsupportedAssetFilterTypeEx Assertions.assertThat(units) .extracting(AssetExtractorTextUnit::getName, AssetExtractorTextUnit::getSource) - .containsExactly(tuple("it's a test", "it's a test")); + .containsExactly(tuple("it\\'s a test", "it\\'s a test")); } @Test From 40a951ec27b57918602f1e368465645374d3e8db Mon Sep 17 00:00:00 2001 From: Wadim Wawrzenczak Date: Tue, 24 Mar 2026 23:07:20 +0100 Subject: [PATCH 06/13] Remove custom EncoderManager/POEncoder override Drop the getEncoderManager() override that mapped PO output to Mojito's custom POEncoder. The SimpleEncoder-based POEncoder missed re-escaping of backslashes, tabs and formfeeds. All encoding now flows through Okapi's builtin PO encoder. --- .../box/l10n/mojito/okapi/filters/POFilter.java | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/common/src/main/java/com/box/l10n/mojito/okapi/filters/POFilter.java b/common/src/main/java/com/box/l10n/mojito/okapi/filters/POFilter.java index 73f8670851..047d1511b6 100644 --- a/common/src/main/java/com/box/l10n/mojito/okapi/filters/POFilter.java +++ b/common/src/main/java/com/box/l10n/mojito/okapi/filters/POFilter.java @@ -15,8 +15,6 @@ import java.util.regex.Pattern; import net.sf.okapi.common.Event; import net.sf.okapi.common.LocaleId; -import net.sf.okapi.common.MimeTypeMapper; -import net.sf.okapi.common.encoder.EncoderManager; import net.sf.okapi.common.filters.FilterConfiguration; import net.sf.okapi.common.resource.DocumentPart; import net.sf.okapi.common.resource.ITextUnit; @@ -75,8 +73,6 @@ public class POFilter extends net.sf.okapi.filters.po.POFilter { Integer poPluralForm; - EncoderManager encoderManager; - @Override public String getName() { return FILTER_CONFIG_ID; @@ -425,16 +421,6 @@ Set getUsagesFromSkeleton(String skeleton) { return locations; } - @Override - public EncoderManager getEncoderManager() { - if (encoderManager == null) { - encoderManager = new EncoderManager(); - encoderManager.setMapping( - MimeTypeMapper.PO_MIME_TYPE, "com.box.l10n.mojito.okapi.filters.POEncoder"); - } - return encoderManager; - } - static String removeUntranslated(String poFile) { StringBuilder poFileWithoutUntranslatedPlural = new StringBuilder(); Scanner scanner = new Scanner(poFile); From 2acb0faef7f653e96e79b0be45b35c4db352726a Mon Sep 17 00:00:00 2001 From: Wadim Wawrzenczak Date: Tue, 24 Mar 2026 23:07:58 +0100 Subject: [PATCH 07/13] Remove custom quote unescaping (UnescapeUtils) Drop UnescapeUtils.replaceEscapedQuotes which was applied on top of Okapi's already-unescaped content, causing double-unescaping bugs. Okapi's builtin PO parser already handles quote unescaping correctly, so the extra pass was both redundant and harmful. --- .../l10n/mojito/okapi/filters/POFilter.java | 38 +------------------ 1 file changed, 2 insertions(+), 36 deletions(-) diff --git a/common/src/main/java/com/box/l10n/mojito/okapi/filters/POFilter.java b/common/src/main/java/com/box/l10n/mojito/okapi/filters/POFilter.java index 047d1511b6..bcfe27e0b4 100644 --- a/common/src/main/java/com/box/l10n/mojito/okapi/filters/POFilter.java +++ b/common/src/main/java/com/box/l10n/mojito/okapi/filters/POFilter.java @@ -1,6 +1,5 @@ package com.box.l10n.mojito.okapi.filters; -import com.box.l10n.mojito.okapi.TextUnitUtils; import com.box.l10n.mojito.okapi.steps.OutputDocumentPostProcessingAnnotation; import com.box.l10n.mojito.po.PoPluralRule; import com.google.common.collect.Multimap; @@ -26,7 +25,6 @@ import net.sf.okapi.common.skeleton.GenericSkeletonPart; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Configurable; import org.springframework.util.ReflectionUtils; @@ -55,10 +53,6 @@ public class POFilter extends net.sf.okapi.filters.po.POFilter { // Consider cleanup but that not that simple as it would require migrating data... static final String PLURAL_SEPARATOR = " _"; - @Autowired TextUnitUtils textUnitUtils; - - @Autowired UnescapeUtils unescapeUtils; - List eventQueue = new ArrayList<>(); LocaleId targetLocale; @@ -152,32 +146,10 @@ void processTextUnit(Event event) { if (event != null && event.isTextUnit()) { TextUnit textUnit = (TextUnit) event.getTextUnit(); renameTextUnitWithSourceAndContent(textUnit); - unescpae(textUnit); addUsagesToTextUnit(textUnit); } } - void unescpae(TextUnit textUnit) { - unescapeSource(textUnit); - unescapeTarget(textUnit); - } - - void unescapeSource(TextUnit textUnit) { - String sourceString = textUnitUtils.getSourceAsString(textUnit); - String unescapedSourceString = unescapeUtils.replaceEscapedQuotes(sourceString); - textUnitUtils.replaceSourceString(textUnit, unescapedSourceString); - } - - void unescapeTarget(TextUnit textUnit) { - TextContainer target = textUnit.getTarget(targetLocale); - if (target != null) { - String targetString = target.toString(); - String unescapedTargetString = unescapeUtils.replaceEscapedQuotes(targetString); - TextContainer newTarget = new TextContainer(unescapedTargetString); - textUnit.setTarget(targetLocale, newTarget); - } - } - boolean isPluralGroupStarting(Event event) { return event != null && event.isStartGroup() @@ -281,18 +253,12 @@ public List getCompletedForms(LocaleId localeId) { @Override void adaptTextUnitToCLDRForm(ITextUnit textUnit, String cldrPluralForm) { - if ("one".equals(cldrPluralForm)) { - // source should always be singular form for "one" form, - // this is needed for language with 6 entry like arabic logger.debug("Set message singular: {}", msgID); - textUnit.setSource(new TextContainer(unescapeUtils.replaceEscapedQuotes(msgID))); + textUnit.setSource(new TextContainer(msgID)); } else { - // source should always be plural form unless for "one" form, - // this is needed for language with only one entry like - // japanese: [0] --> other logger.debug("Set message plural: {}", msgIDPlural); - textUnit.setSource(new TextContainer(unescapeUtils.replaceEscapedQuotes(msgIDPlural))); + textUnit.setSource(new TextContainer(msgIDPlural)); } } From 5d025d95192b5272e5f2afd5cf2b5fe4505881d7 Mon Sep 17 00:00:00 2001 From: Wadim Wawrzenczak Date: Tue, 24 Mar 2026 23:09:49 +0100 Subject: [PATCH 08/13] Simplify event loop structure in POFilter Flatten the indirect dispatch chain (next -> readNextEvents -> getNextWithProcess -> processTextUnit) into a single next() that calls super.next() directly and handles each event type inline. Replace the recursive readPlurals() with processNextPluralGroup() which drains text units in a simple while-until-END_GROUP loop instead of tracking state across recursive calls. Remove intermediate methods: readNextEvents, getNextWithProcess, processTextUnit, isPluralGroupEnding, readPlurals, adaptPlurals, setUsagesOnTextUnits. Reflection-based field loading (msgID, msgIDPlural, poPluralForm) and renameTextUnitWithSourceAndContent are kept unchanged. --- .../l10n/mojito/okapi/filters/POFilter.java | 118 +++++++----------- 1 file changed, 45 insertions(+), 73 deletions(-) diff --git a/common/src/main/java/com/box/l10n/mojito/okapi/filters/POFilter.java b/common/src/main/java/com/box/l10n/mojito/okapi/filters/POFilter.java index bcfe27e0b4..623043f54c 100644 --- a/common/src/main/java/com/box/l10n/mojito/okapi/filters/POFilter.java +++ b/common/src/main/java/com/box/l10n/mojito/okapi/filters/POFilter.java @@ -117,111 +117,83 @@ public boolean hasNext() { @Override public Event next() { - Event event; - - if (eventQueue.isEmpty()) { - readNextEvents(); + if (!eventQueue.isEmpty()) { + return eventQueue.remove(0); } - event = eventQueue.remove(0); - - return event; - } + Event event = super.next(); + loadMsgIDFromParent(); - void readNextEvents() { - Event next = getNextWithProcess(); + if (isPluralGroupStarting(event)) { + processNextPluralGroup(event); + return eventQueue.remove(0); + } - if (isPluralGroupStarting(next)) { - poPluralForm = 0; - readPlurals(next); - } else { - if (next.isDocumentPart()) { - rewritePluralFormInHeader(next.getDocumentPart()); - } - eventQueue.add(next); + if (event.isDocumentPart()) { + rewritePluralFormInHeader(event.getDocumentPart()); } - } - void processTextUnit(Event event) { - if (event != null && event.isTextUnit()) { + if (event.isTextUnit()) { TextUnit textUnit = (TextUnit) event.getTextUnit(); renameTextUnitWithSourceAndContent(textUnit); addUsagesToTextUnit(textUnit); } + + return event; } - boolean isPluralGroupStarting(Event event) { + private boolean isPluralGroupStarting(Event event) { return event != null && event.isStartGroup() && "x-gettext-plurals".equals(event.getStartGroup().getType()); } - boolean isPluralGroupEnding(Event event) { - return poPluralForm != null && event != null && event.isEndGroup(); - } - - Event getNextWithProcess() { - Event next = super.next(); - loadMsgIDFromParent(); - processTextUnit(next); - return next; - } - - void readPlurals(Event next) { - - logger.debug("First event is the start group, load msgidplural from parent and move to next"); + /** + * Consumes a complete plural group from the parent filter (START_GROUP -> TEXT_UNIT* -> + * END_GROUP), remaps PO form indices to CLDR plural forms, completes any missing forms for the + * target locale, and buffers all resulting events in {@link #eventQueue}. + */ + void processNextPluralGroup(Event startGroupEvent) { Set usagesFromSkeleton = - getUsagesFromSkeleton(next.getStartGroup().getSkeleton().toString()); + getUsagesFromSkeleton(startGroupEvent.getStartGroup().getSkeleton().toString()); loadMsgIDPluralFromParent(); - eventQueue.add(next); - - List pluralEvents = new ArrayList<>(); - next = getNextWithProcess(); + eventQueue.add(startGroupEvent); - // add the start event - pluralEvents.add(next); + List textUnitEvents = new ArrayList<>(); + poPluralForm = 0; - poPluralForm++; - next = getNextWithProcess(); + Event next = super.next(); + loadMsgIDFromParent(); - // read others until the end - while (next != null && !isPluralGroupEnding(next)) { - pluralEvents.add(next); - poPluralForm++; - next = getNextWithProcess(); + while (next != null && !next.isEndGroup()) { + if (next.isTextUnit()) { + TextUnit textUnit = (TextUnit) next.getTextUnit(); + renameTextUnitWithSourceAndContent(textUnit); + textUnitEvents.add(next); + poPluralForm++; + } + next = super.next(); + loadMsgIDFromParent(); } poPluralForm = null; - // that doesn't contain last - pluralEvents = adaptPlurals(pluralEvents, usagesFromSkeleton); - - eventQueue.addAll(pluralEvents); - - if (isPluralGroupStarting(next)) { - poPluralForm = 0; - readPlurals(next); - } else { - eventQueue.add(next); - } - } - - List adaptPlurals(List pluralEvents, Set usagesFromSkeleton) { - logger.debug("Adapt plural forms if needed"); PluralsHolder pluralsHolder = new PoPluralsHolder(); - pluralsHolder.loadEvents(pluralEvents); + pluralsHolder.loadEvents(textUnitEvents); List completedForms = pluralsHolder.getCompletedForms(targetLocale); - setUsagesOnTextUnits(completedForms, usagesFromSkeleton); - return completedForms; - } - private void setUsagesOnTextUnits(List pluralEvents, Set usagesFromSkeleton) { - for (Event pluralEvent : pluralEvents) { - if (pluralEvent.isTextUnit()) { - setUsagesAnnotationOnTextUnit(usagesFromSkeleton, pluralEvent.getTextUnit()); + for (Event e : completedForms) { + if (e.isTextUnit()) { + setUsagesAnnotationOnTextUnit(usagesFromSkeleton, e.getTextUnit()); } } + + eventQueue.addAll(completedForms); + + if (next != null) { + eventQueue.add(next); + } } private void setUsagesAnnotationOnTextUnit(Set usagesFromSkeleton, ITextUnit textUnit) { From 813dea7728fd4d9903e54a8aa19e52e23b45d19e Mon Sep 17 00:00:00 2001 From: Wadim Wawrzenczak Date: Tue, 24 Mar 2026 23:10:54 +0100 Subject: [PATCH 09/13] Simplify text unit name handling in POFilter Split renameTextUnitWithSourceAndContent into two focused methods: - setTextUnitName(textUnit, baseName): sets the base name + msgctxt - appendPluralFormToName(textUnit, cldrForm): appends the plural suffix In processNextPluralGroup, use a local loop index to derive the CLDR form instead of the stateful poPluralForm field. This removes the poPluralForm field entirely and decouples plural suffix logic from the base naming method. --- .../l10n/mojito/okapi/filters/POFilter.java | 49 ++++++++++--------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/common/src/main/java/com/box/l10n/mojito/okapi/filters/POFilter.java b/common/src/main/java/com/box/l10n/mojito/okapi/filters/POFilter.java index 623043f54c..677bd7c71d 100644 --- a/common/src/main/java/com/box/l10n/mojito/okapi/filters/POFilter.java +++ b/common/src/main/java/com/box/l10n/mojito/okapi/filters/POFilter.java @@ -65,8 +65,6 @@ public class POFilter extends net.sf.okapi.filters.po.POFilter { String msgID; - Integer poPluralForm; - @Override public String getName() { return FILTER_CONFIG_ID; @@ -135,7 +133,7 @@ public Event next() { if (event.isTextUnit()) { TextUnit textUnit = (TextUnit) event.getTextUnit(); - renameTextUnitWithSourceAndContent(textUnit); + setTextUnitName(textUnit, msgID); addUsagesToTextUnit(textUnit); } @@ -161,24 +159,26 @@ void processNextPluralGroup(Event startGroupEvent) { eventQueue.add(startGroupEvent); List textUnitEvents = new ArrayList<>(); - poPluralForm = 0; + int poFormIndex = 0; Event next = super.next(); loadMsgIDFromParent(); while (next != null && !next.isEndGroup()) { if (next.isTextUnit()) { - TextUnit textUnit = (TextUnit) next.getTextUnit(); - renameTextUnitWithSourceAndContent(textUnit); + ITextUnit textUnit = next.getTextUnit(); + setTextUnitName(textUnit, msgID); + String cldrForm = poPluralRule.poFormToCldrForm(Integer.toString(poFormIndex)); + if (cldrForm != null) { + appendPluralFormToName(textUnit, cldrForm); + } textUnitEvents.add(next); - poPluralForm++; + poFormIndex++; } next = super.next(); loadMsgIDFromParent(); } - poPluralForm = null; - PluralsHolder pluralsHolder = new PoPluralsHolder(); pluralsHolder.loadEvents(textUnitEvents); List completedForms = pluralsHolder.getCompletedForms(targetLocale); @@ -282,33 +282,34 @@ void retainForms(LocaleId localeId, List pluralForms) { } /** - * If context is present, add it to the text unit name. We keep the generated ID by Okapi for - * prefix of the text unit name allows to distinguish plural form easily. + * Sets the text unit name to the given base name, appending the msgctxt when present. Examples: * - *

Note: Decided not to go with empty string id only based the message context to have more - * consistent IDs and support plural forms. It has a little draw back when searching for string - * though as it prevents exact match on context. - * - * @param textUnit + *

    + *
  • {@code "Hello"} — simple entry + *
  • {@code "File --- menu"} — with msgctxt "menu" + *
*/ - void renameTextUnitWithSourceAndContent(ITextUnit textUnit) { - + void setTextUnitName(ITextUnit textUnit, String baseName) { Property property = textUnit.getProperty(POFilter.PROPERTY_CONTEXT); - StringBuilder newName = new StringBuilder(msgID); + StringBuilder newName = new StringBuilder(baseName); if (property != null) { newName.append(" --- ").append(property.getValue()); } - if (poPluralForm != null) { - String cldrForm = poPluralRule.poFormToCldrForm(Integer.toString(poPluralForm)); - newName.append(PLURAL_SEPARATOR).append(cldrForm); - } - textUnit.setName(newName.toString()); } + /** + * Appends a CLDR plural form suffix to the text unit's current name, e.g. {@code "item"} becomes + * {@code "item _other"}. The suffix must be set before {@link PluralsHolder#loadEvents} so that + * {@link PluralsHolder#getCldrPluralFormOfEvent} can parse the form back from the name. + */ + void appendPluralFormToName(ITextUnit textUnit, String cldrPluralForm) { + textUnit.setName(textUnit.getName() + PLURAL_SEPARATOR + cldrPluralForm); + } + /** * Rewrite the plural forms if processing a target locale. * From 33ada2ae3cb186ed097613430fc5405d87b0157f Mon Sep 17 00:00:00 2001 From: Wadim Wawrzenczak Date: Tue, 24 Mar 2026 23:12:12 +0100 Subject: [PATCH 10/13] Remove reflection from POFilter; capture sources locally Remove the msgID/msgIDPlural fields and the reflection helpers (loadMsgIDFromParent, loadMsgIDPluralFromParent, makeAccessibleAndGetString) that loaded them from the parent Okapi POFilter on every event. For non-plural text units, derive the base name directly from the text unit's own source (already unescaped by Okapi). For plural groups, capture singularSource from the first text unit and pluralSource from the second text unit within processNextPluralGroup. Pass both to PoPluralsHolder via constructor parameters so that adaptTextUnitToCLDRForm uses locally-captured values instead of mutable outer-class state. Note: when nPlurals=1 (e.g. Japanese) there is no second text unit, so pluralSource falls back to singularSource. This is addressed in the next commit. --- .../l10n/mojito/okapi/filters/POFilter.java | 85 ++++++++----------- 1 file changed, 34 insertions(+), 51 deletions(-) diff --git a/common/src/main/java/com/box/l10n/mojito/okapi/filters/POFilter.java b/common/src/main/java/com/box/l10n/mojito/okapi/filters/POFilter.java index 677bd7c71d..fe4f23cbc4 100644 --- a/common/src/main/java/com/box/l10n/mojito/okapi/filters/POFilter.java +++ b/common/src/main/java/com/box/l10n/mojito/okapi/filters/POFilter.java @@ -3,7 +3,6 @@ import com.box.l10n.mojito.okapi.steps.OutputDocumentPostProcessingAnnotation; import com.box.l10n.mojito.po.PoPluralRule; import com.google.common.collect.Multimap; -import java.lang.reflect.Field; import java.util.ArrayList; import java.util.LinkedHashSet; import java.util.List; @@ -26,7 +25,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Configurable; -import org.springframework.util.ReflectionUtils; /** * Extends {@link net.sf.okapi.filters.po.POFilter} to somehow support gettext @@ -61,10 +59,6 @@ public class POFilter extends net.sf.okapi.filters.po.POFilter { boolean hasCopyFormsOnImport = false; - String msgIDPlural; - - String msgID; - @Override public String getName() { return FILTER_CONFIG_ID; @@ -120,7 +114,6 @@ public Event next() { } Event event = super.next(); - loadMsgIDFromParent(); if (isPluralGroupStarting(event)) { processNextPluralGroup(event); @@ -133,7 +126,7 @@ public Event next() { if (event.isTextUnit()) { TextUnit textUnit = (TextUnit) event.getTextUnit(); - setTextUnitName(textUnit, msgID); + setTextUnitName(textUnit, textUnit.getSource().toString()); addUsagesToTextUnit(textUnit); } @@ -155,31 +148,37 @@ void processNextPluralGroup(Event startGroupEvent) { Set usagesFromSkeleton = getUsagesFromSkeleton(startGroupEvent.getStartGroup().getSkeleton().toString()); - loadMsgIDPluralFromParent(); eventQueue.add(startGroupEvent); List textUnitEvents = new ArrayList<>(); - int poFormIndex = 0; - Event next = super.next(); - loadMsgIDFromParent(); - while (next != null && !next.isEndGroup()) { if (next.isTextUnit()) { - ITextUnit textUnit = next.getTextUnit(); - setTextUnitName(textUnit, msgID); - String cldrForm = poPluralRule.poFormToCldrForm(Integer.toString(poFormIndex)); - if (cldrForm != null) { - appendPluralFormToName(textUnit, cldrForm); - } textUnitEvents.add(next); - poFormIndex++; } next = super.next(); - loadMsgIDFromParent(); } - PluralsHolder pluralsHolder = new PoPluralsHolder(); + String singularSource = textUnitEvents.get(0).getTextUnit().getSource().toString(); + + // When nPlurals >= 2 the second text unit's source is the msgid_plural. + // When nPlurals == 1 (e.g. Japanese) there is only one text unit — the next commit + // adds extractPluralSourceFromSkeleton to handle that edge case correctly. + String pluralSource = + textUnitEvents.size() > 1 + ? textUnitEvents.get(1).getTextUnit().getSource().toString() + : singularSource; + + for (int i = 0; i < textUnitEvents.size(); i++) { + ITextUnit textUnit = textUnitEvents.get(i).getTextUnit(); + setTextUnitName(textUnit, singularSource); + String cldrForm = poPluralRule.poFormToCldrForm(Integer.toString(i)); + if (cldrForm != null) { + appendPluralFormToName(textUnit, cldrForm); + } + } + + PluralsHolder pluralsHolder = new PoPluralsHolder(singularSource, pluralSource); pluralsHolder.loadEvents(textUnitEvents); List completedForms = pluralsHolder.getCompletedForms(targetLocale); @@ -202,20 +201,26 @@ private void setUsagesAnnotationOnTextUnit(Set usagesFromSkeleton, IText class PoPluralsHolder extends PluralsHolder { + private final String singularSource; + private final String pluralSource; + + PoPluralsHolder(String singularSource, String pluralSource) { + this.singularSource = singularSource; + this.pluralSource = pluralSource; + } + @Override public List getCompletedForms(LocaleId localeId) { if (other == null) { if (few != null) { - logger.debug( - "Other is not defined but few is, means it is for a language where few can be copied like Russian"); + logger.debug("other is not defined, copying from few (e.g. Russian)"); other = createCopyOf(few, "few", "other"); } else if (zero != null) { - logger.debug( - "Other and few are not defined but one is, means it is for a language where few can be copied like Arabic"); + logger.debug("other and few are not defined, copying from zero (e.g. Arabic)"); other = createCopyOf(zero, "zero", "other"); } else if (two != null) { - logger.debug("Other, few and zero are not defined but one is, copy from two."); + logger.debug("other, few and zero are not defined, copying from two"); other = createCopyOf(two, "two", "other"); } } @@ -226,11 +231,9 @@ public List getCompletedForms(LocaleId localeId) { @Override void adaptTextUnitToCLDRForm(ITextUnit textUnit, String cldrPluralForm) { if ("one".equals(cldrPluralForm)) { - logger.debug("Set message singular: {}", msgID); - textUnit.setSource(new TextContainer(msgID)); + textUnit.setSource(new TextContainer(singularSource)); } else { - logger.debug("Set message plural: {}", msgIDPlural); - textUnit.setSource(new TextContainer(msgIDPlural)); + textUnit.setSource(new TextContainer(pluralSource)); } } @@ -321,26 +324,6 @@ void rewritePluralFormInHeader(DocumentPart documentPart) { } } - void loadMsgIDPluralFromParent() { - Field msgIDPluralParent = - ReflectionUtils.findField(net.sf.okapi.filters.po.POFilter.class, "msgIDPlural"); - msgIDPlural = makeAccessibleAndGetString(msgIDPluralParent); - } - - void loadMsgIDFromParent() { - Field msgIDParent = ReflectionUtils.findField(net.sf.okapi.filters.po.POFilter.class, "msgID"); - msgID = makeAccessibleAndGetString(msgIDParent); - } - - String makeAccessibleAndGetString(Field msgID) { - ReflectionUtils.makeAccessible(msgID); - try { - return (String) msgID.get(this); - } catch (IllegalAccessException | IllegalArgumentException e) { - throw new RuntimeException(e); - } - } - void addUsagesToTextUnit(TextUnit textUnit) { Set usageLocationsFromSkeleton = getUsagesFromSkeleton(textUnit.getSkeleton().toString()); From c25185d3f62f9c85198875ec0994ac9de86f09dd Mon Sep 17 00:00:00 2001 From: Wadim Wawrzenczak Date: Tue, 24 Mar 2026 23:14:27 +0100 Subject: [PATCH 11/13] Handle nPlurals=1 edge case via skeleton extraction When a locale has only one plural form (nPlurals=1, e.g. Japanese), Okapi emits a single TEXT_UNIT with the singular source (msgid). But CLDR maps this to the "other" form, which should carry the plural source (msgid_plural). Since there is no second text unit to read it from, extract msgid_plural directly from the START_GROUP skeleton and unescape it via the parent filter's private unescape method (accessed through reflection because it has no public alternative). Also updates class-level and method-level Javadoc across the file to document the overridden behavior, the PO/CLDR plural model mismatch, and the CLDR category mapping rationale in adaptTextUnitToCLDRForm. --- .../l10n/mojito/okapi/filters/POFilter.java | 146 +++++++++++++++--- 1 file changed, 126 insertions(+), 20 deletions(-) diff --git a/common/src/main/java/com/box/l10n/mojito/okapi/filters/POFilter.java b/common/src/main/java/com/box/l10n/mojito/okapi/filters/POFilter.java index fe4f23cbc4..161d7e936a 100644 --- a/common/src/main/java/com/box/l10n/mojito/okapi/filters/POFilter.java +++ b/common/src/main/java/com/box/l10n/mojito/okapi/filters/POFilter.java @@ -27,36 +27,53 @@ import org.springframework.beans.factory.annotation.Configurable; /** - * Extends {@link net.sf.okapi.filters.po.POFilter} to somehow support gettext - * plural and surface message context as part of the textunit name. - *

- * Maps po plural form to cldr using {@link PoPluralRuleHelper + * Extends {@link net.sf.okapi.filters.po.POFilter} to provide human-readable text unit names and + * add gettext plural support. + * + *

Text Unit names

+ * + * Okapi's base filter produces text units with generated names. This subclass updates them on the + * fly, replacing them with human-readable data, i.e. the source string followed by msgctxt if + * present. + * + *

Plural support

+ * + *

When parsing a plural group, Okapi's base filter simply emits one text unit per each target + * variant (msgstr line). This subclass intercepts the event stream to: + * + *

    + *
  • Rewrite the Plural-Forms header to match rules for the target locale + *
  • Map PO plural forms (from msgstr indices) to CLDR forms (zero, one, two, few, many, other) + * (via {@link PoPluralRule}) and append to the text unit name + *
  • Complete missing CLDR forms according to rules for the target locale + *
+ * + *

Other notes

+ * + *

This filter also extracts source-code reference comments (#:) as usage annotations * * @author jaurambualt */ @Configurable public class POFilter extends net.sf.okapi.filters.po.POFilter { - /** logger */ static Logger logger = LoggerFactory.getLogger(POFilter.class); public static final String FILTER_CONFIG_ID = "okf_po@mojito"; - static final String USAGE_LOCATION_GROUP_NAME = "location"; - static final String USAGE_LOCATION_PATTERN = "#: (?.*)"; - // Having " _" as plural separator wasn't very wise... Android filter for example doesn't have a // space: "_". // see {@link ThirdPartySyncCommand} too. // Consider cleanup but that not that simple as it would require migrating data... static final String PLURAL_SEPARATOR = " _"; + static final String USAGE_LOCATION_GROUP_NAME = "location"; + static final String USAGE_LOCATION_PATTERN = "#: (?.*)"; + List eventQueue = new ArrayList<>(); LocaleId targetLocale; - PoPluralRule poPluralRule; - boolean hasCopyFormsOnImport = false; @Override @@ -107,6 +124,20 @@ public boolean hasNext() { return !eventQueue.isEmpty() || super.hasNext(); } + /** + * The event loop is overridden to intercept events from the parent Okapi POFilter and: + * + *

    + *
  • Rewrite the Plural-Forms header when localizing to a target locale + *
  • Set text unit names to the source content, with msgctxt appended when present + *
  • Extract source-code reference comments (#:) as usage annotations + *
  • Buffer and rewrite plural text unit groups via {@link POFilter#processNextPluralGroup} + *
+ * + * Non-plural events are processed and returned immediately. Plural groups consume multiple parent + * events at once and buffer the results in {@link #eventQueue} so they can be yielded one at a + * time by subsequent {@link #next()} calls. + */ @Override public Event next() { if (!eventQueue.isEmpty()) { @@ -143,10 +174,21 @@ private boolean isPluralGroupStarting(Event event) { * Consumes a complete plural group from the parent filter (START_GROUP -> TEXT_UNIT* -> * END_GROUP), remaps PO form indices to CLDR plural forms, completes any missing forms for the * target locale, and buffers all resulting events in {@link #eventQueue}. + * + *

The parent's Okapi POFilter emits one TEXT_UNIT per msgstr[N]. The source on form 0 is the + * msgid (singular), and on form 1+ it's msgid_plural. We capture these when processing the group + * to set the correct text unit name and source on each plural variant. + * + *

The name of every plural variant is derived from the singular source (msgid), not from the + * variant's own source — this keeps plural text unit names consistent across all forms. + * + *

The source must also be corrected to handle an edge case in how Okapi assigns sources when + * the file has more plural forms than the target locale needs. See {@link + * #extractPluralSourceFromSkeleton} for the workaround. */ void processNextPluralGroup(Event startGroupEvent) { - Set usagesFromSkeleton = - getUsagesFromSkeleton(startGroupEvent.getStartGroup().getSkeleton().toString()); + String startGroupSkeleton = startGroupEvent.getStartGroup().getSkeleton().toString(); + Set usagesFromSkeleton = getUsagesFromSkeleton(startGroupSkeleton); eventQueue.add(startGroupEvent); @@ -162,12 +204,13 @@ void processNextPluralGroup(Event startGroupEvent) { String singularSource = textUnitEvents.get(0).getTextUnit().getSource().toString(); // When nPlurals >= 2 the second text unit's source is the msgid_plural. - // When nPlurals == 1 (e.g. Japanese) there is only one text unit — the next commit - // adds extractPluralSourceFromSkeleton to handle that edge case correctly. + // When nPlurals == 1 (e.g. Japanese) there is no second text unit, but we still need + // msgid_plural — see extractPluralSourceFromSkeleton javadoc for details on the PO/CLDR + // mismatch. String pluralSource = textUnitEvents.size() > 1 ? textUnitEvents.get(1).getTextUnit().getSource().toString() - : singularSource; + : extractPluralSourceFromSkeleton(startGroupSkeleton); for (int i = 0; i < textUnitEvents.size(); i++) { ITextUnit textUnit = textUnitEvents.get(i).getTextUnit(); @@ -230,6 +273,10 @@ public List getCompletedForms(LocaleId localeId) { @Override void adaptTextUnitToCLDRForm(ITextUnit textUnit, String cldrPluralForm) { + // It's ok to map EN `other` to every non-"one" target category regardless of the target + // language rules, because the UI i18n libraries themselves account for cross-language CLDR + // category mappings at render time. Mojito just needs to generate enough categories to + // satisfy the CLDR rules. if ("one".equals(cldrPluralForm)) { textUnit.setSource(new TextContainer(singularSource)); } else { @@ -313,17 +360,76 @@ void appendPluralFormToName(ITextUnit textUnit, String cldrPluralForm) { textUnit.setName(textUnit.getName() + PLURAL_SEPARATOR + cldrPluralForm); } - /** - * Rewrite the plural forms if processing a target locale. - * - * @param documentPart - */ void rewritePluralFormInHeader(DocumentPart documentPart) { if (targetLocale != null && !LocaleId.EMPTY.equals(targetLocale)) { documentPart.setProperty(new Property("pluralforms", poPluralRule.getRule())); } } + /** + * Extracts and unescapes the {@code msgid_plural} value from the START_GROUP skeleton. + * + *

This is needed because of a mismatch between PO and CLDR plural models. Okapi assigns + * sources by PO index: {@code msgstr[0]} always gets {@code msgid} (singular) and {@code + * msgstr[1+]} gets {@code msgid_plural}. But CLDR's "other" form — which should have the plural + * source — can map to {@code msgstr[0]} for locales with a single plural form (e.g. Japanese + * nPlurals=1). In that case Okapi only emits one TEXT_UNIT with the singular source, yet we need + * the plural source to match the text unit that extraction stored in the database (extraction is + * locale-independent and always assigns {@code msgid_plural} as the source for the "other" form). + * + *

The skeleton always contains the raw {@code msgid_plural "..."} line(s) since that is what + * triggers the START_GROUP event in Okapi's POFilter. The raw quoted content is extracted here + * and unescaped via the parent's {@code unescape} method (accessed through reflection because it + * is private with no public alternative). + */ + String extractPluralSourceFromSkeleton(String skeleton) { + StringBuilder raw = new StringBuilder(); + boolean inMsgIdPlural = false; + + for (String line : skeleton.split("\\R")) { + String trimmed = line.trim(); + if (trimmed.startsWith("msgid_plural")) { + inMsgIdPlural = true; + appendQuotedContent(raw, line); + } else if (inMsgIdPlural && trimmed.startsWith("\"")) { + appendQuotedContent(raw, line); + } else if (inMsgIdPlural) { + break; + } + } + + return unescapePo(raw.toString()); + } + + private static void appendQuotedContent(StringBuilder sb, String line) { + int first = line.indexOf('"'); + int last = line.lastIndexOf('"'); + if (first >= 0 && last > first) { + sb.append(line, first + 1, last); + } + } + + /** Delegates to the parent's private {@code unescape} method via reflection. */ + private String unescapePo(String text) { + try { + return (String) PARENT_UNESCAPE.invoke(this, text); + } catch (ReflectiveOperationException e) { + throw new RuntimeException("Failed to invoke Okapi POFilter.unescape", e); + } + } + + private static final java.lang.reflect.Method PARENT_UNESCAPE; + + static { + try { + PARENT_UNESCAPE = + net.sf.okapi.filters.po.POFilter.class.getDeclaredMethod("unescape", String.class); + PARENT_UNESCAPE.setAccessible(true); + } catch (NoSuchMethodException e) { + throw new ExceptionInInitializerError(e); + } + } + void addUsagesToTextUnit(TextUnit textUnit) { Set usageLocationsFromSkeleton = getUsagesFromSkeleton(textUnit.getSkeleton().toString()); From aeca11927ec3127ba96dce586e7b4457ca7986b9 Mon Sep 17 00:00:00 2001 From: Wadim Wawrzenczak Date: Wed, 18 Mar 2026 18:17:50 +0100 Subject: [PATCH 12/13] Remove unused POEncoder Generation of localized PO files is now handled by Okapi's builtin default POEncoder. --- .../l10n/mojito/okapi/filters/POEncoder.java | 20 ------------------- 1 file changed, 20 deletions(-) delete mode 100644 common/src/main/java/com/box/l10n/mojito/okapi/filters/POEncoder.java diff --git a/common/src/main/java/com/box/l10n/mojito/okapi/filters/POEncoder.java b/common/src/main/java/com/box/l10n/mojito/okapi/filters/POEncoder.java deleted file mode 100644 index 6ac81f64c7..0000000000 --- a/common/src/main/java/com/box/l10n/mojito/okapi/filters/POEncoder.java +++ /dev/null @@ -1,20 +0,0 @@ -package com.box.l10n.mojito.okapi.filters; - -import net.sf.okapi.common.resource.Property; - -public class POEncoder extends SimpleEncoder { - - @Override - public String toNative(String propertyName, String value) { - if (Property.APPROVED.equals(propertyName)) { - if ((value != null) && (value.equals("no"))) { - return "fuzzy"; - } else { // Don't set the fuzzy flag - return ""; - } - } - - // No changes for the other values - return value; - } -} From 6d031306dbb790a99192055422556d731fa4a9ef Mon Sep 17 00:00:00 2001 From: Wadim Wawrzenczak Date: Wed, 18 Mar 2026 18:57:19 +0100 Subject: [PATCH 13/13] Filter out events with no CLDR category mapping --- .../com/box/l10n/mojito/okapi/filters/POFilter.java | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/common/src/main/java/com/box/l10n/mojito/okapi/filters/POFilter.java b/common/src/main/java/com/box/l10n/mojito/okapi/filters/POFilter.java index 161d7e936a..b42dcbfdc8 100644 --- a/common/src/main/java/com/box/l10n/mojito/okapi/filters/POFilter.java +++ b/common/src/main/java/com/box/l10n/mojito/okapi/filters/POFilter.java @@ -212,17 +212,21 @@ void processNextPluralGroup(Event startGroupEvent) { ? textUnitEvents.get(1).getTextUnit().getSource().toString() : extractPluralSourceFromSkeleton(startGroupSkeleton); + // Text units whose PO form index has no CLDR mapping (e.g. form 1 for Japanese ONE_FORM) + // are filtered out + List filteredEvents = new ArrayList<>(); for (int i = 0; i < textUnitEvents.size(); i++) { - ITextUnit textUnit = textUnitEvents.get(i).getTextUnit(); - setTextUnitName(textUnit, singularSource); String cldrForm = poPluralRule.poFormToCldrForm(Integer.toString(i)); if (cldrForm != null) { + ITextUnit textUnit = textUnitEvents.get(i).getTextUnit(); + setTextUnitName(textUnit, singularSource); appendPluralFormToName(textUnit, cldrForm); + filteredEvents.add(textUnitEvents.get(i)); } } PluralsHolder pluralsHolder = new PoPluralsHolder(singularSource, pluralSource); - pluralsHolder.loadEvents(textUnitEvents); + pluralsHolder.loadEvents(filteredEvents); List completedForms = pluralsHolder.getCompletedForms(targetLocale); for (Event e : completedForms) {