diff --git a/common/src/main/java/com/box/l10n/mojito/okapi/filters/POEncoder.java b/common/src/main/java/com/box/l10n/mojito/okapi/filters/POEncoder.java deleted file mode 100644 index 6ac81f64c7..0000000000 --- a/common/src/main/java/com/box/l10n/mojito/okapi/filters/POEncoder.java +++ /dev/null @@ -1,20 +0,0 @@ -package com.box.l10n.mojito.okapi.filters; - -import net.sf.okapi.common.resource.Property; - -public class POEncoder extends SimpleEncoder { - - @Override - public String toNative(String propertyName, String value) { - if (Property.APPROVED.equals(propertyName)) { - if ((value != null) && (value.equals("no"))) { - return "fuzzy"; - } else { // Don't set the fuzzy flag - return ""; - } - } - - // No changes for the other values - return value; - } -} diff --git a/common/src/main/java/com/box/l10n/mojito/okapi/filters/POFilter.java b/common/src/main/java/com/box/l10n/mojito/okapi/filters/POFilter.java index 73f8670851..b42dcbfdc8 100644 --- a/common/src/main/java/com/box/l10n/mojito/okapi/filters/POFilter.java +++ b/common/src/main/java/com/box/l10n/mojito/okapi/filters/POFilter.java @@ -1,10 +1,8 @@ package com.box.l10n.mojito.okapi.filters; -import com.box.l10n.mojito.okapi.TextUnitUtils; import com.box.l10n.mojito.okapi.steps.OutputDocumentPostProcessingAnnotation; import com.box.l10n.mojito.po.PoPluralRule; import com.google.common.collect.Multimap; -import java.lang.reflect.Field; import java.util.ArrayList; import java.util.LinkedHashSet; import java.util.List; @@ -15,8 +13,6 @@ import java.util.regex.Pattern; import net.sf.okapi.common.Event; import net.sf.okapi.common.LocaleId; -import net.sf.okapi.common.MimeTypeMapper; -import net.sf.okapi.common.encoder.EncoderManager; import net.sf.okapi.common.filters.FilterConfiguration; import net.sf.okapi.common.resource.DocumentPart; import net.sf.okapi.common.resource.ITextUnit; @@ -28,55 +24,58 @@ import net.sf.okapi.common.skeleton.GenericSkeletonPart; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Configurable; -import org.springframework.util.ReflectionUtils; /** - * Extends {@link net.sf.okapi.filters.po.POFilter} to somehow support gettext - * plural and surface message context as part of the textunit name. - *

- * Maps po plural form to cldr using {@link PoPluralRuleHelper + * Extends {@link net.sf.okapi.filters.po.POFilter} to provide human-readable text unit names and + * add gettext plural support. + * + *

Text Unit names

+ * + * Okapi's base filter produces text units with generated names. This subclass updates them on the + * fly, replacing them with human-readable data, i.e. the source string followed by msgctxt if + * present. + * + *

Plural support

+ * + *

When parsing a plural group, Okapi's base filter simply emits one text unit per each target + * variant (msgstr line). This subclass intercepts the event stream to: + * + *

+ * + *

Other notes

+ * + *

This filter also extracts source-code reference comments (#:) as usage annotations * * @author jaurambualt */ @Configurable public class POFilter extends net.sf.okapi.filters.po.POFilter { - /** logger */ static Logger logger = LoggerFactory.getLogger(POFilter.class); public static final String FILTER_CONFIG_ID = "okf_po@mojito"; - static final String USAGE_LOCATION_GROUP_NAME = "location"; - static final String USAGE_LOCATION_PATTERN = "#: (?.*)"; - // Having " _" as plural separator wasn't very wise... Android filter for example doesn't have a // space: "_". // see {@link ThirdPartySyncCommand} too. // Consider cleanup but that not that simple as it would require migrating data... static final String PLURAL_SEPARATOR = " _"; - @Autowired TextUnitUtils textUnitUtils; - - @Autowired UnescapeUtils unescapeUtils; + static final String USAGE_LOCATION_GROUP_NAME = "location"; + static final String USAGE_LOCATION_PATTERN = "#: (?.*)"; List eventQueue = new ArrayList<>(); LocaleId targetLocale; - PoPluralRule poPluralRule; - boolean hasCopyFormsOnImport = false; - String msgIDPlural; - - String msgID; - - Integer poPluralForm; - - EncoderManager encoderManager; - @Override public String getName() { return FILTER_CONFIG_ID; @@ -125,135 +124,122 @@ public boolean hasNext() { return !eventQueue.isEmpty() || super.hasNext(); } + /** + * The event loop is overridden to intercept events from the parent Okapi POFilter and: + * + *

+ * + * Non-plural events are processed and returned immediately. Plural groups consume multiple parent + * events at once and buffer the results in {@link #eventQueue} so they can be yielded one at a + * time by subsequent {@link #next()} calls. + */ @Override public Event next() { - Event event; - - if (eventQueue.isEmpty()) { - readNextEvents(); + if (!eventQueue.isEmpty()) { + return eventQueue.remove(0); } - event = eventQueue.remove(0); + Event event = super.next(); - return event; - } - - void readNextEvents() { - Event next = getNextWithProcess(); + if (isPluralGroupStarting(event)) { + processNextPluralGroup(event); + return eventQueue.remove(0); + } - if (isPluralGroupStarting(next)) { - poPluralForm = 0; - readPlurals(next); - } else { - if (next.isDocumentPart()) { - rewritePluralFormInHeader(next.getDocumentPart()); - } - eventQueue.add(next); + if (event.isDocumentPart()) { + rewritePluralFormInHeader(event.getDocumentPart()); } - } - void processTextUnit(Event event) { - if (event != null && event.isTextUnit()) { + if (event.isTextUnit()) { TextUnit textUnit = (TextUnit) event.getTextUnit(); - renameTextUnitWithSourceAndContent(textUnit); - unescpae(textUnit); + setTextUnitName(textUnit, textUnit.getSource().toString()); addUsagesToTextUnit(textUnit); } - } - - void unescpae(TextUnit textUnit) { - unescapeSource(textUnit); - unescapeTarget(textUnit); - } - - void unescapeSource(TextUnit textUnit) { - String sourceString = textUnitUtils.getSourceAsString(textUnit); - String unescapedSourceString = unescapeUtils.replaceEscapedQuotes(sourceString); - textUnitUtils.replaceSourceString(textUnit, unescapedSourceString); - } - void unescapeTarget(TextUnit textUnit) { - TextContainer target = textUnit.getTarget(targetLocale); - if (target != null) { - String targetString = target.toString(); - String unescapedTargetString = unescapeUtils.replaceEscapedQuotes(targetString); - TextContainer newTarget = new TextContainer(unescapedTargetString); - textUnit.setTarget(targetLocale, newTarget); - } + return event; } - boolean isPluralGroupStarting(Event event) { + private boolean isPluralGroupStarting(Event event) { return event != null && event.isStartGroup() && "x-gettext-plurals".equals(event.getStartGroup().getType()); } - boolean isPluralGroupEnding(Event event) { - return poPluralForm != null && event != null && event.isEndGroup(); - } - - Event getNextWithProcess() { - Event next = super.next(); - loadMsgIDFromParent(); - processTextUnit(next); - return next; - } - - void readPlurals(Event next) { - - logger.debug("First event is the start group, load msgidplural from parent and move to next"); - Set usagesFromSkeleton = - getUsagesFromSkeleton(next.getStartGroup().getSkeleton().toString()); - - loadMsgIDPluralFromParent(); - eventQueue.add(next); - - List pluralEvents = new ArrayList<>(); - next = getNextWithProcess(); - - // add the start event - pluralEvents.add(next); + /** + * Consumes a complete plural group from the parent filter (START_GROUP -> TEXT_UNIT* -> + * END_GROUP), remaps PO form indices to CLDR plural forms, completes any missing forms for the + * target locale, and buffers all resulting events in {@link #eventQueue}. + * + *

The parent's Okapi POFilter emits one TEXT_UNIT per msgstr[N]. The source on form 0 is the + * msgid (singular), and on form 1+ it's msgid_plural. We capture these when processing the group + * to set the correct text unit name and source on each plural variant. + * + *

The name of every plural variant is derived from the singular source (msgid), not from the + * variant's own source — this keeps plural text unit names consistent across all forms. + * + *

The source must also be corrected to handle an edge case in how Okapi assigns sources when + * the file has more plural forms than the target locale needs. See {@link + * #extractPluralSourceFromSkeleton} for the workaround. + */ + void processNextPluralGroup(Event startGroupEvent) { + String startGroupSkeleton = startGroupEvent.getStartGroup().getSkeleton().toString(); + Set usagesFromSkeleton = getUsagesFromSkeleton(startGroupSkeleton); - poPluralForm++; - next = getNextWithProcess(); + eventQueue.add(startGroupEvent); - // read others until the end - while (next != null && !isPluralGroupEnding(next)) { - pluralEvents.add(next); - poPluralForm++; - next = getNextWithProcess(); + List textUnitEvents = new ArrayList<>(); + Event next = super.next(); + while (next != null && !next.isEndGroup()) { + if (next.isTextUnit()) { + textUnitEvents.add(next); + } + next = super.next(); } - poPluralForm = null; - - // that doesn't contain last - pluralEvents = adaptPlurals(pluralEvents, usagesFromSkeleton); - - eventQueue.addAll(pluralEvents); - - if (isPluralGroupStarting(next)) { - poPluralForm = 0; - readPlurals(next); - } else { - eventQueue.add(next); + String singularSource = textUnitEvents.get(0).getTextUnit().getSource().toString(); + + // When nPlurals >= 2 the second text unit's source is the msgid_plural. + // When nPlurals == 1 (e.g. Japanese) there is no second text unit, but we still need + // msgid_plural — see extractPluralSourceFromSkeleton javadoc for details on the PO/CLDR + // mismatch. + String pluralSource = + textUnitEvents.size() > 1 + ? textUnitEvents.get(1).getTextUnit().getSource().toString() + : extractPluralSourceFromSkeleton(startGroupSkeleton); + + // Text units whose PO form index has no CLDR mapping (e.g. form 1 for Japanese ONE_FORM) + // are filtered out + List filteredEvents = new ArrayList<>(); + for (int i = 0; i < textUnitEvents.size(); i++) { + String cldrForm = poPluralRule.poFormToCldrForm(Integer.toString(i)); + if (cldrForm != null) { + ITextUnit textUnit = textUnitEvents.get(i).getTextUnit(); + setTextUnitName(textUnit, singularSource); + appendPluralFormToName(textUnit, cldrForm); + filteredEvents.add(textUnitEvents.get(i)); + } } - } - List adaptPlurals(List pluralEvents, Set usagesFromSkeleton) { - logger.debug("Adapt plural forms if needed"); - PluralsHolder pluralsHolder = new PoPluralsHolder(); - pluralsHolder.loadEvents(pluralEvents); + PluralsHolder pluralsHolder = new PoPluralsHolder(singularSource, pluralSource); + pluralsHolder.loadEvents(filteredEvents); List completedForms = pluralsHolder.getCompletedForms(targetLocale); - setUsagesOnTextUnits(completedForms, usagesFromSkeleton); - return completedForms; - } - private void setUsagesOnTextUnits(List pluralEvents, Set usagesFromSkeleton) { - for (Event pluralEvent : pluralEvents) { - if (pluralEvent.isTextUnit()) { - setUsagesAnnotationOnTextUnit(usagesFromSkeleton, pluralEvent.getTextUnit()); + for (Event e : completedForms) { + if (e.isTextUnit()) { + setUsagesAnnotationOnTextUnit(usagesFromSkeleton, e.getTextUnit()); } } + + eventQueue.addAll(completedForms); + + if (next != null) { + eventQueue.add(next); + } } private void setUsagesAnnotationOnTextUnit(Set usagesFromSkeleton, ITextUnit textUnit) { @@ -262,20 +248,26 @@ private void setUsagesAnnotationOnTextUnit(Set usagesFromSkeleton, IText class PoPluralsHolder extends PluralsHolder { + private final String singularSource; + private final String pluralSource; + + PoPluralsHolder(String singularSource, String pluralSource) { + this.singularSource = singularSource; + this.pluralSource = pluralSource; + } + @Override public List getCompletedForms(LocaleId localeId) { if (other == null) { if (few != null) { - logger.debug( - "Other is not defined but few is, means it is for a language where few can be copied like Russian"); + logger.debug("other is not defined, copying from few (e.g. Russian)"); other = createCopyOf(few, "few", "other"); } else if (zero != null) { - logger.debug( - "Other and few are not defined but one is, means it is for a language where few can be copied like Arabic"); + logger.debug("other and few are not defined, copying from zero (e.g. Arabic)"); other = createCopyOf(zero, "zero", "other"); } else if (two != null) { - logger.debug("Other, few and zero are not defined but one is, copy from two."); + logger.debug("other, few and zero are not defined, copying from two"); other = createCopyOf(two, "two", "other"); } } @@ -285,18 +277,14 @@ public List getCompletedForms(LocaleId localeId) { @Override void adaptTextUnitToCLDRForm(ITextUnit textUnit, String cldrPluralForm) { - + // It's ok to map EN `other` to every non-"one" target category regardless of the target + // language rules, because the UI i18n libraries themselves account for cross-language CLDR + // category mappings at render time. Mojito just needs to generate enough categories to + // satisfy the CLDR rules. if ("one".equals(cldrPluralForm)) { - // source should always be singular form for "one" form, - // this is needed for language with 6 entry like arabic - logger.debug("Set message singular: {}", msgID); - textUnit.setSource(new TextContainer(unescapeUtils.replaceEscapedQuotes(msgID))); + textUnit.setSource(new TextContainer(singularSource)); } else { - // source should always be plural form unless for "one" form, - // this is needed for language with only one entry like - // japanese: [0] --> other - logger.debug("Set message plural: {}", msgIDPlural); - textUnit.setSource(new TextContainer(unescapeUtils.replaceEscapedQuotes(msgIDPlural))); + textUnit.setSource(new TextContainer(pluralSource)); } } @@ -348,61 +336,101 @@ void retainForms(LocaleId localeId, List pluralForms) { } /** - * If context is present, add it to the text unit name. We keep the generated ID by Okapi for - * prefix of the text unit name allows to distinguish plural form easily. - * - *

Note: Decided not to go with empty string id only based the message context to have more - * consistent IDs and support plural forms. It has a little draw back when searching for string - * though as it prevents exact match on context. + * Sets the text unit name to the given base name, appending the msgctxt when present. Examples: * - * @param textUnit + *

*/ - void renameTextUnitWithSourceAndContent(ITextUnit textUnit) { - + void setTextUnitName(ITextUnit textUnit, String baseName) { Property property = textUnit.getProperty(POFilter.PROPERTY_CONTEXT); - StringBuilder newName = new StringBuilder(msgID); + StringBuilder newName = new StringBuilder(baseName); if (property != null) { newName.append(" --- ").append(property.getValue()); } - if (poPluralForm != null) { - String cldrForm = poPluralRule.poFormToCldrForm(Integer.toString(poPluralForm)); - newName.append(PLURAL_SEPARATOR).append(cldrForm); - } - textUnit.setName(newName.toString()); } /** - * Rewrite the plural forms if processing a target locale. - * - * @param documentPart + * Appends a CLDR plural form suffix to the text unit's current name, e.g. {@code "item"} becomes + * {@code "item _other"}. The suffix must be set before {@link PluralsHolder#loadEvents} so that + * {@link PluralsHolder#getCldrPluralFormOfEvent} can parse the form back from the name. */ + void appendPluralFormToName(ITextUnit textUnit, String cldrPluralForm) { + textUnit.setName(textUnit.getName() + PLURAL_SEPARATOR + cldrPluralForm); + } + void rewritePluralFormInHeader(DocumentPart documentPart) { if (targetLocale != null && !LocaleId.EMPTY.equals(targetLocale)) { documentPart.setProperty(new Property("pluralforms", poPluralRule.getRule())); } } - void loadMsgIDPluralFromParent() { - Field msgIDPluralParent = - ReflectionUtils.findField(net.sf.okapi.filters.po.POFilter.class, "msgIDPlural"); - msgIDPlural = makeAccessibleAndGetString(msgIDPluralParent); + /** + * Extracts and unescapes the {@code msgid_plural} value from the START_GROUP skeleton. + * + *

This is needed because of a mismatch between PO and CLDR plural models. Okapi assigns + * sources by PO index: {@code msgstr[0]} always gets {@code msgid} (singular) and {@code + * msgstr[1+]} gets {@code msgid_plural}. But CLDR's "other" form — which should have the plural + * source — can map to {@code msgstr[0]} for locales with a single plural form (e.g. Japanese + * nPlurals=1). In that case Okapi only emits one TEXT_UNIT with the singular source, yet we need + * the plural source to match the text unit that extraction stored in the database (extraction is + * locale-independent and always assigns {@code msgid_plural} as the source for the "other" form). + * + *

The skeleton always contains the raw {@code msgid_plural "..."} line(s) since that is what + * triggers the START_GROUP event in Okapi's POFilter. The raw quoted content is extracted here + * and unescaped via the parent's {@code unescape} method (accessed through reflection because it + * is private with no public alternative). + */ + String extractPluralSourceFromSkeleton(String skeleton) { + StringBuilder raw = new StringBuilder(); + boolean inMsgIdPlural = false; + + for (String line : skeleton.split("\\R")) { + String trimmed = line.trim(); + if (trimmed.startsWith("msgid_plural")) { + inMsgIdPlural = true; + appendQuotedContent(raw, line); + } else if (inMsgIdPlural && trimmed.startsWith("\"")) { + appendQuotedContent(raw, line); + } else if (inMsgIdPlural) { + break; + } + } + + return unescapePo(raw.toString()); } - void loadMsgIDFromParent() { - Field msgIDParent = ReflectionUtils.findField(net.sf.okapi.filters.po.POFilter.class, "msgID"); - msgID = makeAccessibleAndGetString(msgIDParent); + private static void appendQuotedContent(StringBuilder sb, String line) { + int first = line.indexOf('"'); + int last = line.lastIndexOf('"'); + if (first >= 0 && last > first) { + sb.append(line, first + 1, last); + } } - String makeAccessibleAndGetString(Field msgID) { - ReflectionUtils.makeAccessible(msgID); + /** Delegates to the parent's private {@code unescape} method via reflection. */ + private String unescapePo(String text) { try { - return (String) msgID.get(this); - } catch (IllegalAccessException | IllegalArgumentException e) { - throw new RuntimeException(e); + return (String) PARENT_UNESCAPE.invoke(this, text); + } catch (ReflectiveOperationException e) { + throw new RuntimeException("Failed to invoke Okapi POFilter.unescape", e); + } + } + + private static final java.lang.reflect.Method PARENT_UNESCAPE; + + static { + try { + PARENT_UNESCAPE = + net.sf.okapi.filters.po.POFilter.class.getDeclaredMethod("unescape", String.class); + PARENT_UNESCAPE.setAccessible(true); + } catch (NoSuchMethodException e) { + throw new ExceptionInInitializerError(e); } } @@ -425,16 +453,6 @@ Set getUsagesFromSkeleton(String skeleton) { return locations; } - @Override - public EncoderManager getEncoderManager() { - if (encoderManager == null) { - encoderManager = new EncoderManager(); - encoderManager.setMapping( - MimeTypeMapper.PO_MIME_TYPE, "com.box.l10n.mojito.okapi.filters.POEncoder"); - } - return encoderManager; - } - static String removeUntranslated(String poFile) { StringBuilder poFileWithoutUntranslatedPlural = new StringBuilder(); Scanner scanner = new Scanner(poFile); diff --git a/common/src/test/java/com/box/l10n/mojito/okapi/filters/POFilterTest.java b/common/src/test/java/com/box/l10n/mojito/okapi/filters/POFilterTest.java index c72a1f13d5..e98ebeca0a 100644 --- a/common/src/test/java/com/box/l10n/mojito/okapi/filters/POFilterTest.java +++ b/common/src/test/java/com/box/l10n/mojito/okapi/filters/POFilterTest.java @@ -1,7 +1,6 @@ package com.box.l10n.mojito.okapi.filters; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNull; import java.util.ArrayList; import java.util.List; @@ -13,20 +12,6 @@ */ public class POFilterTest { - @Test - public void loadMsgIDPluralFromParent() { - POFilter poFilter = new POFilter(); - poFilter.loadMsgIDPluralFromParent(); - assertNull(poFilter.msgIDPlural); - } - - @Test - public void loadMsgIDFromParent() { - POFilter poFilter = new POFilter(); - poFilter.loadMsgIDFromParent(); - assertNull(poFilter.msgID); - } - @Test public void getUsagesFromSkeleton() { String skeleton = diff --git a/common/src/test/java/com/box/l10n/mojito/okapi/filters/extraction/POFilterExtractionTest.java b/common/src/test/java/com/box/l10n/mojito/okapi/filters/extraction/POFilterExtractionTest.java index 31d826486e..dc34e3f2e5 100644 --- a/common/src/test/java/com/box/l10n/mojito/okapi/filters/extraction/POFilterExtractionTest.java +++ b/common/src/test/java/com/box/l10n/mojito/okapi/filters/extraction/POFilterExtractionTest.java @@ -64,11 +64,13 @@ public void extractWithEscapedDoubleQuotes() throws UnsupportedAssetFilterTypeEx Assertions.assertThat(units) .extracting(AssetExtractorTextUnit::getName, AssetExtractorTextUnit::getSource) - .containsExactly(tuple("He said \\\"hello\\\"", "He said \"hello\"")); + .containsExactly(tuple("He said \"hello\"", "He said \"hello\"")); } @Test - public void extractWithEscapedSingleQuotes() throws UnsupportedAssetFilterTypeException { + public void extractWithBackslashAndSingleQuote() throws UnsupportedAssetFilterTypeException { + // \' is not a standard PO escape sequence (PO uses double-quote delimiters), + // so the backslash is preserved as a literal character by Okapi's parser. List units = extract( "test.pot", @@ -81,7 +83,7 @@ public void extractWithEscapedSingleQuotes() throws UnsupportedAssetFilterTypeEx Assertions.assertThat(units) .extracting(AssetExtractorTextUnit::getName, AssetExtractorTextUnit::getSource) - .containsExactly(tuple("it\\'s a test", "it's a test")); + .containsExactly(tuple("it\\'s a test", "it\\'s a test")); } @Test @@ -157,11 +159,159 @@ public void extractPluralsWithEscapedQuotes() throws UnsupportedAssetFilterTypeE AssetExtractorTextUnit::getPluralForm, AssetExtractorTextUnit::getPluralFormOther) .containsExactly( - tuple("the \\\"item\\\" _zero", "the \"items\"", "zero", "the \\\"item\\\" _other"), - tuple("the \\\"item\\\" _one", "the \"item\"", "one", "the \\\"item\\\" _other"), - tuple("the \\\"item\\\" _two", "the \"items\"", "two", "the \\\"item\\\" _other"), - tuple("the \\\"item\\\" _few", "the \"items\"", "few", "the \\\"item\\\" _other"), - tuple("the \\\"item\\\" _many", "the \"items\"", "many", "the \\\"item\\\" _other"), - tuple("the \\\"item\\\" _other", "the \"items\"", "other", "the \\\"item\\\" _other")); + tuple("the \"item\" _zero", "the \"items\"", "zero", "the \"item\" _other"), + tuple("the \"item\" _one", "the \"item\"", "one", "the \"item\" _other"), + tuple("the \"item\" _two", "the \"items\"", "two", "the \"item\" _other"), + tuple("the \"item\" _few", "the \"items\"", "few", "the \"item\" _other"), + tuple("the \"item\" _many", "the \"items\"", "many", "the \"item\" _other"), + tuple("the \"item\" _other", "the \"items\"", "other", "the \"item\" _other")); + } + + @Test + public void extractWithEscapedBackslashQuoteNewlineTab() + throws UnsupportedAssetFilterTypeException { + // based on: + // https://gitlab.com/okapiframework/Okapi/-/blob/v1.45.0/okapi/filters/po/src/test/java/net/sf/okapi/filters/po/POFilterTest.java#L241-251 + List units = + extract( + "test.pot", + """ + msgid "Text \\\\ and \\" and \\n and \\t" + msgstr "" + """, + null, + null); + + Assertions.assertThat(units) + .extracting(AssetExtractorTextUnit::getName, AssetExtractorTextUnit::getSource) + .containsExactly(tuple("Text \\ and \" and \n and \t", "Text \\ and \" and \n and \t")); + } + + @Test + public void extractWithEscapedFormFeedBackspaceCarriageReturn() + throws UnsupportedAssetFilterTypeException { + // based on: + // https://gitlab.com/okapiframework/Okapi/-/blob/v1.45.0/okapi/filters/po/src/test/java/net/sf/okapi/filters/po/POFilterTest.java#L253-263 + List units = + extract( + "test.pot", + """ + msgid "Text \\f and \\b and \\rr" + msgstr "" + """, + null, + null); + + Assertions.assertThat(units) + .extracting(AssetExtractorTextUnit::getName, AssetExtractorTextUnit::getSource) + .containsExactly(tuple("Text \f and \b and \rr", "Text \f and \b and \rr")); + } + + @Test + public void extractWithEscapeAtStartOfString() throws UnsupportedAssetFilterTypeException { + // based on: + // https://gitlab.com/okapiframework/Okapi/-/blob/v1.45.0/okapi/filters/po/src/test/java/net/sf/okapi/filters/po/POFilterTest.java#L265-276 + List units = + extract( + "test.pot", + """ + msgid "\\t." + msgstr "" + """, + null, + null); + + Assertions.assertThat(units) + .extracting(AssetExtractorTextUnit::getName, AssetExtractorTextUnit::getSource) + .containsExactly(tuple("\t.", "\t.")); + } + + @Test + public void extractWithUnescapedQuoteInMiddle() throws UnsupportedAssetFilterTypeException { + // based on: + // https://gitlab.com/okapiframework/Okapi/-/blob/v1.45.0/okapi/filters/po/src/test/java/net/sf/okapi/filters/po/POFilterTest.java#L280-290 + + // Verifies parity with how Okapi handles malformed escaping in PO files + + // Malformed PO: unescaped quote in middle of msgid string. + // PO Parser uses lastIndexOf('"') to find the closing delimiter, + // so given a PO snippet: + // `msgid "A " and a \"` + // the parsed string content after unescaping is: + // `A " and a \` + // (both an unescaped inner quote and the single trailing backslash are treated as literals) + List units = + extract( + "test.pot", + """ + msgid "A " and a \\" + msgstr "" + """, + null, + null); + + Assertions.assertThat(units) + .extracting(AssetExtractorTextUnit::getName, AssetExtractorTextUnit::getSource) + .containsExactly(tuple("A \" and a \\", "A \" and a \\")); + } + + @Test + public void extractWithEscapedAndDanglingBackslash() throws UnsupportedAssetFilterTypeException { + // based on: + // https://gitlab.com/okapiframework/Okapi/-/blob/v1.45.0/okapi/filters/po/src/test/java/net/sf/okapi/filters/po/POFilterTest.java#L292-303 + + // Verifies parity with how Okapi handles malformed escaping in PO files + + // PO snippet + // `msgid "\\\"` + // has 3 backslashes between quotes: escaped backslash (\\) + // followed by a dangling backslash (\) before closing quote. + List units = + extract( + "test.pot", + """ + msgid "\\\\\\" + msgstr "" + """, + null, + null); + + // After unescape: \\ → \ (one backslash), dangling \ stays → total 2 backslashes. + Assertions.assertThat(units) + .extracting(AssetExtractorTextUnit::getName, AssetExtractorTextUnit::getSource) + .containsExactly(tuple("\\\\", "\\\\")); + } + + @Test + public void extractWithNoDoubleUnescaping() throws UnsupportedAssetFilterTypeException { + List units = + extract( + "test.pot", + // Actual source string in the localized application UI is: + // `Name cannot contain "/", "\", or characters outside the basic multilingual plane.` + // In PO file content, each double quote and backslash is escaped with another + // backslash. + // `msgid "Name cannot contain \"/\", \"\\\", or characters outside the basic + // multilingual plane."` + // Finally, in Java text block we must escape each backslash additionally + """ + msgid "Name cannot contain \\"/\\", \\"\\\\\\", or characters outside the basic multilingual plane." + msgstr "" + """, + null, + null); + + Assertions.assertThat(units) + .extracting(AssetExtractorTextUnit::getName, AssetExtractorTextUnit::getSource) + .containsExactly( + // Mojito should unescape correctly matching the actual UI string + // `Name cannot contain "/", "\", or characters outside the basic multilingual plane.` + tuple( + """ + Name cannot contain "/", "\\", or characters outside the basic multilingual plane.\ + """, + """ + Name cannot contain "/", "\\", or characters outside the basic multilingual plane.\ + """)); } } diff --git a/webapp/src/test/java/com/box/l10n/mojito/service/tm/localizeasset/LocalizePoAssetTest.java b/webapp/src/test/java/com/box/l10n/mojito/service/tm/localizeasset/LocalizePoAssetTest.java index b1d584dfeb..beef4702c5 100644 --- a/webapp/src/test/java/com/box/l10n/mojito/service/tm/localizeasset/LocalizePoAssetTest.java +++ b/webapp/src/test/java/com/box/l10n/mojito/service/tm/localizeasset/LocalizePoAssetTest.java @@ -246,7 +246,7 @@ public void testLocalizePoEscaping() throws Exception { assertEquals(1, textUnitDTOs.size()); TextUnitDTO textUnitDTO = textUnitDTOs.get(0); - assertEquals("repin \\\"{}\\\"", textUnitDTO.getName()); + assertEquals("repin \"{}\"", textUnitDTO.getName()); assertEquals("repin \"{}\"", textUnitDTO.getSource()); String localizedAsset = generateLocalized(assetContent, repoLocale, "ja-JP"); @@ -286,7 +286,7 @@ public void testLocalizePoEscaping() throws Exception { assertEquals(1, textUnitDTOs.size()); textUnitDTO = textUnitDTOs.get(0); - assertEquals("repin \\\"{}\\\"", textUnitDTO.getName()); + assertEquals("repin \"{}\"", textUnitDTO.getName()); assertEquals("repin \"{}\"", textUnitDTO.getSource()); assertEquals("repin \"{}\" jp", textUnitDTO.getTarget()); } @@ -668,4 +668,304 @@ public void testImportLocalizedAssetPoPluralSinglePlural() throws Exception { """; importTranslations(repoLocale, localizedAssetContent, StatusForEqualTarget.APPROVED); } + + @Test + public void testLocalizePoEscapedBackslashNewlineTab() throws Exception { + // based on: + // https://gitlab.com/okapiframework/Okapi/-/blob/v1.45.0/okapi/filters/po/src/test/java/net/sf/okapi/filters/po/POFilterTest.java#L241-251 + + Repository repo = createRepository(); + RepositoryLocale repoLocale = addLocale(repo, "ja-JP"); + + String assetContent = + """ + msgid "" + msgstr "" + "Project-Id-Version: PACKAGE VERSION\\n" + "Report-Msgid-Bugs-To: \\n" + "POT-Creation-Date: 2017-09-15 11:53-0500\\n" + "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n" + "Last-Translator: FULL NAME \\n" + "Language-Team: LANGUAGE \\n" + "MIME-Version: 1.0\\n" + "Plural-Forms: nplurals=2; plural=(n != 1);\\n" + "Content-Type: text/plain; charset=utf-8\\n" + "Content-Transfer-Encoding: 8bit\\n" + #. Comments + #: src/main.py:10 + msgid "Text \\\\ and \\" and \\n and \\t" + msgstr "" + """; + + String expectedLocalizedAsset = + """ + msgid "" + msgstr "" + "Project-Id-Version: PACKAGE VERSION\\n" + "Report-Msgid-Bugs-To: \\n" + "POT-Creation-Date: 2017-09-15 11:53-0500\\n" + "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n" + "Last-Translator: FULL NAME \\n" + "Language-Team: LANGUAGE \\n" + "MIME-Version: 1.0\\n" + "Plural-Forms: nplurals=1; plural=0;\\n" + "Content-Type: text/plain; charset=utf-8\\n" + "Content-Transfer-Encoding: 8bit\\n" + #. Comments + #: src/main.py:10 + msgid "Text \\\\ and \\" and \\n and \\t" + msgstr "Text \\\\ and \\" and \\n and \\t" + """; + + createAsset(repo, "messages.pot", assetContent); + processAsset(repo, assetContent); + + List textUnitDTOs = searchTextUnits(repo); + + assertEquals(1, textUnitDTOs.size()); + TextUnitDTO textUnitDTO = textUnitDTOs.get(0); + assertEquals("Text \\ and \" and \n and \t", textUnitDTO.getName()); + assertEquals("Text \\ and \" and \n and \t", textUnitDTO.getSource()); + + String localizedAsset = generateLocalized(assetContent, repoLocale, "ja-JP"); + logger.debug("localized=\n{}", localizedAsset); + assertEquals(expectedLocalizedAsset, localizedAsset); + + String forImport = + """ + msgid "" + msgstr "" + "Project-Id-Version: PACKAGE VERSION\\n" + "Report-Msgid-Bugs-To: \\n" + "POT-Creation-Date: 2017-09-15 11:53-0500\\n" + "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n" + "Last-Translator: FULL NAME \\n" + "Language-Team: LANGUAGE \\n" + "MIME-Version: 1.0\\n" + "Plural-Forms: nplurals=1; plural=0;\\n" + "Content-Type: text/plain; charset=utf-8\\n" + "Content-Transfer-Encoding: 8bit\\n" + #. Comments + #: src/main.py:10 + msgid "Text \\\\ and \\" and \\n and \\t" + msgstr "Text \\\\ and \\" and \\n and \\t jp" + """; + + importTranslations(repoLocale, forImport, StatusForEqualTarget.TRANSLATION_NEEDED); + + localizedAsset = generateLocalized(assetContent, repoLocale, "ja-JP"); + logger.debug("localized after import=\n{}", localizedAsset); + assertEquals(forImport, localizedAsset); + + TextUnitSearcherParameters textUnitSearcherParameters = new TextUnitSearcherParameters(); + textUnitSearcherParameters.setRepositoryIds(repo.getId()); + textUnitSearcherParameters.setStatusFilter(StatusFilter.TRANSLATED); + textUnitSearcherParameters.setLocaleId(repoLocale.getLocale().getId()); + textUnitDTOs = textUnitSearcher.search(textUnitSearcherParameters); + + assertEquals(1, textUnitDTOs.size()); + textUnitDTO = textUnitDTOs.get(0); + assertEquals("Text \\ and \" and \n and \t", textUnitDTO.getName()); + assertEquals("Text \\ and \" and \n and \t", textUnitDTO.getSource()); + assertEquals("Text \\ and \" and \n and \t jp", textUnitDTO.getTarget()); + } + + @Test + public void testLocalizePoUnescapedRewrite() throws Exception { + // based on: + // https://gitlab.com/okapiframework/Okapi/-/blob/v1.45.0/okapi/filters/po/src/test/java/net/sf/okapi/filters/po/POFilterTest.java#L308-312 + + // Verifies parity with how Okapi handles malformed escaping in PO files + + Repository repo = createRepository(); + RepositoryLocale repoLocale = addLocale(repo, "ja-JP"); + + // Malformed PO: unescaped quote in middle of msgid string. + // PO Parser uses lastIndexOf('"') to find the closing delimiter, + // so given a PO snippet: + // `msgid "A " and a \"` + // the parsed string content after unescaping is: + // `A " and a \` + // (both an unescaped inner quote and the single trailing backslash are treated as literals) + String assetContent = + """ + msgid "" + msgstr "" + "Project-Id-Version: PACKAGE VERSION\\n" + "Report-Msgid-Bugs-To: \\n" + "POT-Creation-Date: 2017-09-15 11:53-0500\\n" + "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n" + "Last-Translator: FULL NAME \\n" + "Language-Team: LANGUAGE \\n" + "MIME-Version: 1.0\\n" + "Plural-Forms: nplurals=2; plural=(n != 1);\\n" + "Content-Type: text/plain; charset=utf-8\\n" + "Content-Transfer-Encoding: 8bit\\n" + msgid "A " and a \\" + msgstr "" + """; + + // On localization (source copied to target), encoder should properly re-escape: + // `msgstr "A \" and a \\" + String expectedLocalizedAsset = + """ + msgid "" + msgstr "" + "Project-Id-Version: PACKAGE VERSION\\n" + "Report-Msgid-Bugs-To: \\n" + "POT-Creation-Date: 2017-09-15 11:53-0500\\n" + "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n" + "Last-Translator: FULL NAME \\n" + "Language-Team: LANGUAGE \\n" + "MIME-Version: 1.0\\n" + "Plural-Forms: nplurals=1; plural=0;\\n" + "Content-Type: text/plain; charset=utf-8\\n" + "Content-Transfer-Encoding: 8bit\\n" + msgid "A " and a \\" + msgstr "A \\" and a \\\\" + """; + + createAsset(repo, "messages.pot", assetContent); + processAsset(repo, assetContent); + + List textUnitDTOs = searchTextUnits(repo); + + assertEquals(1, textUnitDTOs.size()); + TextUnitDTO textUnitDTO = textUnitDTOs.get(0); + assertEquals( + """ + A " and a \\""", + textUnitDTO.getName()); + assertEquals( + """ + A " and a \\""", + textUnitDTO.getSource()); + + String localizedAsset = generateLocalized(assetContent, repoLocale, "ja-JP"); + logger.debug("localized=\n{}", localizedAsset); + assertEquals(expectedLocalizedAsset, localizedAsset); + } + + @Test + public void testLocalizePoEscapingSymmetry() throws Exception { + + Repository repo = createRepository(); + RepositoryLocale repoLocale = addLocale(repo, "ja-JP"); + + // Actual source string in the localized application UI is: + // `Name cannot contain "/", "\", or characters outside the basic multilingual plane.` + // `msgid "Name cannot contain \"/\", \"\\\", or characters outside the basic multilingual + // plane."` + // Finally, in Java text block we must escape each backslash additionally + + String assetContent = + """ + msgid "" + msgstr "" + "Project-Id-Version: PACKAGE VERSION\\n" + "Report-Msgid-Bugs-To: \\n" + "POT-Creation-Date: 2017-09-15 11:53-0500\\n" + "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n" + "Last-Translator: FULL NAME \\n" + "Language-Team: LANGUAGE \\n" + "MIME-Version: 1.0\\n" + "Plural-Forms: nplurals=2; plural=(n != 1);\\n" + "Content-Type: text/plain; charset=utf-8\\n" + "Content-Transfer-Encoding: 8bit\\n" + #: src/main.py:10 + msgid "Name cannot contain \\"/\\", \\"\\\\\\", or characters outside the basic multilingual plane." + msgstr "" + """; + + // Target string backfilled from source must match the source exactly, + // i.e. the unescaping when parsing and re-escaping when generating a localized asset + // must be symmetric + String expectedLocalizedAsset = + """ + msgid "" + msgstr "" + "Project-Id-Version: PACKAGE VERSION\\n" + "Report-Msgid-Bugs-To: \\n" + "POT-Creation-Date: 2017-09-15 11:53-0500\\n" + "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n" + "Last-Translator: FULL NAME \\n" + "Language-Team: LANGUAGE \\n" + "MIME-Version: 1.0\\n" + "Plural-Forms: nplurals=1; plural=0;\\n" + "Content-Type: text/plain; charset=utf-8\\n" + "Content-Transfer-Encoding: 8bit\\n" + #: src/main.py:10 + msgid "Name cannot contain \\"/\\", \\"\\\\\\", or characters outside the basic multilingual plane." + msgstr "Name cannot contain \\"/\\", \\"\\\\\\", or characters outside the basic multilingual plane." + """; + + createAsset(repo, "messages.pot", assetContent); + processAsset(repo, assetContent); + + List textUnitDTOs = searchTextUnits(repo); + + assertEquals(1, textUnitDTOs.size()); + TextUnitDTO textUnitDTO = textUnitDTOs.get(0); + assertEquals( + """ + Name cannot contain "/", "\\", or characters outside the basic multilingual plane.""", + textUnitDTO.getName()); + assertEquals( + """ + Name cannot contain "/", "\\", or characters outside the basic multilingual plane.""", + textUnitDTO.getSource()); + + // Verify source is copied to target with proper re-escaping + String localizedAsset = generateLocalized(assetContent, repoLocale, "ja-JP"); + logger.debug("localized=\n{}", localizedAsset); + assertEquals(expectedLocalizedAsset, localizedAsset); + + // Import a translation where the target has a literal backslash. + // `Nazwa nie może zawierać „/”, „\”, ani znaków spoza podstawowej klawiatury wielojęzycznej.` + // (Note that in Polish translation those are different quote characters which don't need + // escaping) + // Translator should be able to type a literal target string in Mojito UI + // which should then be correctly escaped when generating a localized asset + // In PO file content, the backslash is escaped with another backslash: + // `msgstr "Nazwa nie może zawierać „/”, „\\\\”, ani znaków spoza podstawowej klawiatury + // wielojęzycznej."` + String forImport = + """ + msgid "" + msgstr "" + "Project-Id-Version: PACKAGE VERSION\\n" + "Report-Msgid-Bugs-To: \\n" + "POT-Creation-Date: 2017-09-15 11:53-0500\\n" + "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n" + "Last-Translator: FULL NAME \\n" + "Language-Team: LANGUAGE \\n" + "MIME-Version: 1.0\\n" + "Plural-Forms: nplurals=1; plural=0;\\n" + "Content-Type: text/plain; charset=utf-8\\n" + "Content-Transfer-Encoding: 8bit\\n" + #: src/main.py:10 + msgid "Name cannot contain \\"/\\", \\"\\\\\\", or characters outside the basic multilingual plane." + msgstr "Nazwa nie może zawierać „/”, „\\\\”, ani znaków spoza podstawowej klawiatury wielojęzycznej." + """; + + importTranslations(repoLocale, forImport, StatusForEqualTarget.TRANSLATION_NEEDED); + + TextUnitSearcherParameters textUnitSearcherParameters = new TextUnitSearcherParameters(); + textUnitSearcherParameters.setRepositoryIds(repo.getId()); + textUnitSearcherParameters.setStatusFilter(StatusFilter.TRANSLATED); + textUnitSearcherParameters.setLocaleId(repoLocale.getLocale().getId()); + textUnitDTOs = textUnitSearcher.search(textUnitSearcherParameters); + + assertEquals(1, textUnitDTOs.size()); + textUnitDTO = textUnitDTOs.get(0); + // TM should store the unescaped target (single backslash) + assertEquals( + "Nazwa nie może zawierać „/”, „\\”, ani znaków spoza podstawowej klawiatury wielojęzycznej.", + textUnitDTO.getTarget()); + + // Re-generate localized PO — encoder should re-escape \ to \\ + localizedAsset = generateLocalized(assetContent, repoLocale, "ja-JP"); + logger.debug("localized after import=\n{}", localizedAsset); + assertEquals(forImport, localizedAsset); + } }