diff --git a/common/src/main/java/com/box/l10n/mojito/okapi/filters/POEncoder.java b/common/src/main/java/com/box/l10n/mojito/okapi/filters/POEncoder.java
deleted file mode 100644
index 6ac81f64c7..0000000000
--- a/common/src/main/java/com/box/l10n/mojito/okapi/filters/POEncoder.java
+++ /dev/null
@@ -1,20 +0,0 @@
-package com.box.l10n.mojito.okapi.filters;
-
-import net.sf.okapi.common.resource.Property;
-
-public class POEncoder extends SimpleEncoder {
-
- @Override
- public String toNative(String propertyName, String value) {
- if (Property.APPROVED.equals(propertyName)) {
- if ((value != null) && (value.equals("no"))) {
- return "fuzzy";
- } else { // Don't set the fuzzy flag
- return "";
- }
- }
-
- // No changes for the other values
- return value;
- }
-}
diff --git a/common/src/main/java/com/box/l10n/mojito/okapi/filters/POFilter.java b/common/src/main/java/com/box/l10n/mojito/okapi/filters/POFilter.java
index 73f8670851..b42dcbfdc8 100644
--- a/common/src/main/java/com/box/l10n/mojito/okapi/filters/POFilter.java
+++ b/common/src/main/java/com/box/l10n/mojito/okapi/filters/POFilter.java
@@ -1,10 +1,8 @@
package com.box.l10n.mojito.okapi.filters;
-import com.box.l10n.mojito.okapi.TextUnitUtils;
import com.box.l10n.mojito.okapi.steps.OutputDocumentPostProcessingAnnotation;
import com.box.l10n.mojito.po.PoPluralRule;
import com.google.common.collect.Multimap;
-import java.lang.reflect.Field;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
@@ -15,8 +13,6 @@
import java.util.regex.Pattern;
import net.sf.okapi.common.Event;
import net.sf.okapi.common.LocaleId;
-import net.sf.okapi.common.MimeTypeMapper;
-import net.sf.okapi.common.encoder.EncoderManager;
import net.sf.okapi.common.filters.FilterConfiguration;
import net.sf.okapi.common.resource.DocumentPart;
import net.sf.okapi.common.resource.ITextUnit;
@@ -28,55 +24,58 @@
import net.sf.okapi.common.skeleton.GenericSkeletonPart;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Configurable;
-import org.springframework.util.ReflectionUtils;
/**
- * Extends {@link net.sf.okapi.filters.po.POFilter} to somehow support gettext
- * plural and surface message context as part of the textunit name.
- *
- * Maps po plural form to cldr using {@link PoPluralRuleHelper
+ * Extends {@link net.sf.okapi.filters.po.POFilter} to provide human-readable text unit names and
+ * add gettext plural support.
+ *
+ *
Text Unit names
+ *
+ * Okapi's base filter produces text units with generated names. This subclass updates them on the
+ * fly, replacing them with human-readable data, i.e. the source string followed by msgctxt if
+ * present.
+ *
+ * Plural support
+ *
+ * When parsing a plural group, Okapi's base filter simply emits one text unit per each target
+ * variant (msgstr line). This subclass intercepts the event stream to:
+ *
+ *
+ * - Rewrite the Plural-Forms header to match rules for the target locale
+ *
- Map PO plural forms (from msgstr indices) to CLDR forms (zero, one, two, few, many, other)
+ * (via {@link PoPluralRule}) and append to the text unit name
+ *
- Complete missing CLDR forms according to rules for the target locale
+ *
+ *
+ * Other notes
+ *
+ * This filter also extracts source-code reference comments (#:) as usage annotations
*
* @author jaurambualt
*/
@Configurable
public class POFilter extends net.sf.okapi.filters.po.POFilter {
- /** logger */
static Logger logger = LoggerFactory.getLogger(POFilter.class);
public static final String FILTER_CONFIG_ID = "okf_po@mojito";
- static final String USAGE_LOCATION_GROUP_NAME = "location";
- static final String USAGE_LOCATION_PATTERN = "#: (?.*)";
-
// Having " _" as plural separator wasn't very wise... Android filter for example doesn't have a
// space: "_".
// see {@link ThirdPartySyncCommand} too.
// Consider cleanup but that not that simple as it would require migrating data...
static final String PLURAL_SEPARATOR = " _";
- @Autowired TextUnitUtils textUnitUtils;
-
- @Autowired UnescapeUtils unescapeUtils;
+ static final String USAGE_LOCATION_GROUP_NAME = "location";
+ static final String USAGE_LOCATION_PATTERN = "#: (?.*)";
List eventQueue = new ArrayList<>();
LocaleId targetLocale;
-
PoPluralRule poPluralRule;
-
boolean hasCopyFormsOnImport = false;
- String msgIDPlural;
-
- String msgID;
-
- Integer poPluralForm;
-
- EncoderManager encoderManager;
-
@Override
public String getName() {
return FILTER_CONFIG_ID;
@@ -125,135 +124,122 @@ public boolean hasNext() {
return !eventQueue.isEmpty() || super.hasNext();
}
+ /**
+ * The event loop is overridden to intercept events from the parent Okapi POFilter and:
+ *
+ *
+ * - Rewrite the Plural-Forms header when localizing to a target locale
+ *
- Set text unit names to the source content, with msgctxt appended when present
+ *
- Extract source-code reference comments (#:) as usage annotations
+ *
- Buffer and rewrite plural text unit groups via {@link POFilter#processNextPluralGroup}
+ *
+ *
+ * Non-plural events are processed and returned immediately. Plural groups consume multiple parent
+ * events at once and buffer the results in {@link #eventQueue} so they can be yielded one at a
+ * time by subsequent {@link #next()} calls.
+ */
@Override
public Event next() {
- Event event;
-
- if (eventQueue.isEmpty()) {
- readNextEvents();
+ if (!eventQueue.isEmpty()) {
+ return eventQueue.remove(0);
}
- event = eventQueue.remove(0);
+ Event event = super.next();
- return event;
- }
-
- void readNextEvents() {
- Event next = getNextWithProcess();
+ if (isPluralGroupStarting(event)) {
+ processNextPluralGroup(event);
+ return eventQueue.remove(0);
+ }
- if (isPluralGroupStarting(next)) {
- poPluralForm = 0;
- readPlurals(next);
- } else {
- if (next.isDocumentPart()) {
- rewritePluralFormInHeader(next.getDocumentPart());
- }
- eventQueue.add(next);
+ if (event.isDocumentPart()) {
+ rewritePluralFormInHeader(event.getDocumentPart());
}
- }
- void processTextUnit(Event event) {
- if (event != null && event.isTextUnit()) {
+ if (event.isTextUnit()) {
TextUnit textUnit = (TextUnit) event.getTextUnit();
- renameTextUnitWithSourceAndContent(textUnit);
- unescpae(textUnit);
+ setTextUnitName(textUnit, textUnit.getSource().toString());
addUsagesToTextUnit(textUnit);
}
- }
-
- void unescpae(TextUnit textUnit) {
- unescapeSource(textUnit);
- unescapeTarget(textUnit);
- }
-
- void unescapeSource(TextUnit textUnit) {
- String sourceString = textUnitUtils.getSourceAsString(textUnit);
- String unescapedSourceString = unescapeUtils.replaceEscapedQuotes(sourceString);
- textUnitUtils.replaceSourceString(textUnit, unescapedSourceString);
- }
- void unescapeTarget(TextUnit textUnit) {
- TextContainer target = textUnit.getTarget(targetLocale);
- if (target != null) {
- String targetString = target.toString();
- String unescapedTargetString = unescapeUtils.replaceEscapedQuotes(targetString);
- TextContainer newTarget = new TextContainer(unescapedTargetString);
- textUnit.setTarget(targetLocale, newTarget);
- }
+ return event;
}
- boolean isPluralGroupStarting(Event event) {
+ private boolean isPluralGroupStarting(Event event) {
return event != null
&& event.isStartGroup()
&& "x-gettext-plurals".equals(event.getStartGroup().getType());
}
- boolean isPluralGroupEnding(Event event) {
- return poPluralForm != null && event != null && event.isEndGroup();
- }
-
- Event getNextWithProcess() {
- Event next = super.next();
- loadMsgIDFromParent();
- processTextUnit(next);
- return next;
- }
-
- void readPlurals(Event next) {
-
- logger.debug("First event is the start group, load msgidplural from parent and move to next");
- Set usagesFromSkeleton =
- getUsagesFromSkeleton(next.getStartGroup().getSkeleton().toString());
-
- loadMsgIDPluralFromParent();
- eventQueue.add(next);
-
- List pluralEvents = new ArrayList<>();
- next = getNextWithProcess();
-
- // add the start event
- pluralEvents.add(next);
+ /**
+ * Consumes a complete plural group from the parent filter (START_GROUP -> TEXT_UNIT* ->
+ * END_GROUP), remaps PO form indices to CLDR plural forms, completes any missing forms for the
+ * target locale, and buffers all resulting events in {@link #eventQueue}.
+ *
+ * The parent's Okapi POFilter emits one TEXT_UNIT per msgstr[N]. The source on form 0 is the
+ * msgid (singular), and on form 1+ it's msgid_plural. We capture these when processing the group
+ * to set the correct text unit name and source on each plural variant.
+ *
+ *
The name of every plural variant is derived from the singular source (msgid), not from the
+ * variant's own source — this keeps plural text unit names consistent across all forms.
+ *
+ *
The source must also be corrected to handle an edge case in how Okapi assigns sources when
+ * the file has more plural forms than the target locale needs. See {@link
+ * #extractPluralSourceFromSkeleton} for the workaround.
+ */
+ void processNextPluralGroup(Event startGroupEvent) {
+ String startGroupSkeleton = startGroupEvent.getStartGroup().getSkeleton().toString();
+ Set usagesFromSkeleton = getUsagesFromSkeleton(startGroupSkeleton);
- poPluralForm++;
- next = getNextWithProcess();
+ eventQueue.add(startGroupEvent);
- // read others until the end
- while (next != null && !isPluralGroupEnding(next)) {
- pluralEvents.add(next);
- poPluralForm++;
- next = getNextWithProcess();
+ List textUnitEvents = new ArrayList<>();
+ Event next = super.next();
+ while (next != null && !next.isEndGroup()) {
+ if (next.isTextUnit()) {
+ textUnitEvents.add(next);
+ }
+ next = super.next();
}
- poPluralForm = null;
-
- // that doesn't contain last
- pluralEvents = adaptPlurals(pluralEvents, usagesFromSkeleton);
-
- eventQueue.addAll(pluralEvents);
-
- if (isPluralGroupStarting(next)) {
- poPluralForm = 0;
- readPlurals(next);
- } else {
- eventQueue.add(next);
+ String singularSource = textUnitEvents.get(0).getTextUnit().getSource().toString();
+
+ // When nPlurals >= 2 the second text unit's source is the msgid_plural.
+ // When nPlurals == 1 (e.g. Japanese) there is no second text unit, but we still need
+ // msgid_plural — see extractPluralSourceFromSkeleton javadoc for details on the PO/CLDR
+ // mismatch.
+ String pluralSource =
+ textUnitEvents.size() > 1
+ ? textUnitEvents.get(1).getTextUnit().getSource().toString()
+ : extractPluralSourceFromSkeleton(startGroupSkeleton);
+
+ // Text units whose PO form index has no CLDR mapping (e.g. form 1 for Japanese ONE_FORM)
+ // are filtered out
+ List filteredEvents = new ArrayList<>();
+ for (int i = 0; i < textUnitEvents.size(); i++) {
+ String cldrForm = poPluralRule.poFormToCldrForm(Integer.toString(i));
+ if (cldrForm != null) {
+ ITextUnit textUnit = textUnitEvents.get(i).getTextUnit();
+ setTextUnitName(textUnit, singularSource);
+ appendPluralFormToName(textUnit, cldrForm);
+ filteredEvents.add(textUnitEvents.get(i));
+ }
}
- }
- List adaptPlurals(List pluralEvents, Set usagesFromSkeleton) {
- logger.debug("Adapt plural forms if needed");
- PluralsHolder pluralsHolder = new PoPluralsHolder();
- pluralsHolder.loadEvents(pluralEvents);
+ PluralsHolder pluralsHolder = new PoPluralsHolder(singularSource, pluralSource);
+ pluralsHolder.loadEvents(filteredEvents);
List completedForms = pluralsHolder.getCompletedForms(targetLocale);
- setUsagesOnTextUnits(completedForms, usagesFromSkeleton);
- return completedForms;
- }
- private void setUsagesOnTextUnits(List pluralEvents, Set usagesFromSkeleton) {
- for (Event pluralEvent : pluralEvents) {
- if (pluralEvent.isTextUnit()) {
- setUsagesAnnotationOnTextUnit(usagesFromSkeleton, pluralEvent.getTextUnit());
+ for (Event e : completedForms) {
+ if (e.isTextUnit()) {
+ setUsagesAnnotationOnTextUnit(usagesFromSkeleton, e.getTextUnit());
}
}
+
+ eventQueue.addAll(completedForms);
+
+ if (next != null) {
+ eventQueue.add(next);
+ }
}
private void setUsagesAnnotationOnTextUnit(Set usagesFromSkeleton, ITextUnit textUnit) {
@@ -262,20 +248,26 @@ private void setUsagesAnnotationOnTextUnit(Set usagesFromSkeleton, IText
class PoPluralsHolder extends PluralsHolder {
+ private final String singularSource;
+ private final String pluralSource;
+
+ PoPluralsHolder(String singularSource, String pluralSource) {
+ this.singularSource = singularSource;
+ this.pluralSource = pluralSource;
+ }
+
@Override
public List getCompletedForms(LocaleId localeId) {
if (other == null) {
if (few != null) {
- logger.debug(
- "Other is not defined but few is, means it is for a language where few can be copied like Russian");
+ logger.debug("other is not defined, copying from few (e.g. Russian)");
other = createCopyOf(few, "few", "other");
} else if (zero != null) {
- logger.debug(
- "Other and few are not defined but one is, means it is for a language where few can be copied like Arabic");
+ logger.debug("other and few are not defined, copying from zero (e.g. Arabic)");
other = createCopyOf(zero, "zero", "other");
} else if (two != null) {
- logger.debug("Other, few and zero are not defined but one is, copy from two.");
+ logger.debug("other, few and zero are not defined, copying from two");
other = createCopyOf(two, "two", "other");
}
}
@@ -285,18 +277,14 @@ public List getCompletedForms(LocaleId localeId) {
@Override
void adaptTextUnitToCLDRForm(ITextUnit textUnit, String cldrPluralForm) {
-
+ // It's ok to map EN `other` to every non-"one" target category regardless of the target
+ // language rules, because the UI i18n libraries themselves account for cross-language CLDR
+ // category mappings at render time. Mojito just needs to generate enough categories to
+ // satisfy the CLDR rules.
if ("one".equals(cldrPluralForm)) {
- // source should always be singular form for "one" form,
- // this is needed for language with 6 entry like arabic
- logger.debug("Set message singular: {}", msgID);
- textUnit.setSource(new TextContainer(unescapeUtils.replaceEscapedQuotes(msgID)));
+ textUnit.setSource(new TextContainer(singularSource));
} else {
- // source should always be plural form unless for "one" form,
- // this is needed for language with only one entry like
- // japanese: [0] --> other
- logger.debug("Set message plural: {}", msgIDPlural);
- textUnit.setSource(new TextContainer(unescapeUtils.replaceEscapedQuotes(msgIDPlural)));
+ textUnit.setSource(new TextContainer(pluralSource));
}
}
@@ -348,61 +336,101 @@ void retainForms(LocaleId localeId, List pluralForms) {
}
/**
- * If context is present, add it to the text unit name. We keep the generated ID by Okapi for
- * prefix of the text unit name allows to distinguish plural form easily.
- *
- * Note: Decided not to go with empty string id only based the message context to have more
- * consistent IDs and support plural forms. It has a little draw back when searching for string
- * though as it prevents exact match on context.
+ * Sets the text unit name to the given base name, appending the msgctxt when present. Examples:
*
- * @param textUnit
+ *
+ * - {@code "Hello"} — simple entry
+ *
- {@code "File --- menu"} — with msgctxt "menu"
+ *
*/
- void renameTextUnitWithSourceAndContent(ITextUnit textUnit) {
-
+ void setTextUnitName(ITextUnit textUnit, String baseName) {
Property property = textUnit.getProperty(POFilter.PROPERTY_CONTEXT);
- StringBuilder newName = new StringBuilder(msgID);
+ StringBuilder newName = new StringBuilder(baseName);
if (property != null) {
newName.append(" --- ").append(property.getValue());
}
- if (poPluralForm != null) {
- String cldrForm = poPluralRule.poFormToCldrForm(Integer.toString(poPluralForm));
- newName.append(PLURAL_SEPARATOR).append(cldrForm);
- }
-
textUnit.setName(newName.toString());
}
/**
- * Rewrite the plural forms if processing a target locale.
- *
- * @param documentPart
+ * Appends a CLDR plural form suffix to the text unit's current name, e.g. {@code "item"} becomes
+ * {@code "item _other"}. The suffix must be set before {@link PluralsHolder#loadEvents} so that
+ * {@link PluralsHolder#getCldrPluralFormOfEvent} can parse the form back from the name.
*/
+ void appendPluralFormToName(ITextUnit textUnit, String cldrPluralForm) {
+ textUnit.setName(textUnit.getName() + PLURAL_SEPARATOR + cldrPluralForm);
+ }
+
void rewritePluralFormInHeader(DocumentPart documentPart) {
if (targetLocale != null && !LocaleId.EMPTY.equals(targetLocale)) {
documentPart.setProperty(new Property("pluralforms", poPluralRule.getRule()));
}
}
- void loadMsgIDPluralFromParent() {
- Field msgIDPluralParent =
- ReflectionUtils.findField(net.sf.okapi.filters.po.POFilter.class, "msgIDPlural");
- msgIDPlural = makeAccessibleAndGetString(msgIDPluralParent);
+ /**
+ * Extracts and unescapes the {@code msgid_plural} value from the START_GROUP skeleton.
+ *
+ * This is needed because of a mismatch between PO and CLDR plural models. Okapi assigns
+ * sources by PO index: {@code msgstr[0]} always gets {@code msgid} (singular) and {@code
+ * msgstr[1+]} gets {@code msgid_plural}. But CLDR's "other" form — which should have the plural
+ * source — can map to {@code msgstr[0]} for locales with a single plural form (e.g. Japanese
+ * nPlurals=1). In that case Okapi only emits one TEXT_UNIT with the singular source, yet we need
+ * the plural source to match the text unit that extraction stored in the database (extraction is
+ * locale-independent and always assigns {@code msgid_plural} as the source for the "other" form).
+ *
+ *
The skeleton always contains the raw {@code msgid_plural "..."} line(s) since that is what
+ * triggers the START_GROUP event in Okapi's POFilter. The raw quoted content is extracted here
+ * and unescaped via the parent's {@code unescape} method (accessed through reflection because it
+ * is private with no public alternative).
+ */
+ String extractPluralSourceFromSkeleton(String skeleton) {
+ StringBuilder raw = new StringBuilder();
+ boolean inMsgIdPlural = false;
+
+ for (String line : skeleton.split("\\R")) {
+ String trimmed = line.trim();
+ if (trimmed.startsWith("msgid_plural")) {
+ inMsgIdPlural = true;
+ appendQuotedContent(raw, line);
+ } else if (inMsgIdPlural && trimmed.startsWith("\"")) {
+ appendQuotedContent(raw, line);
+ } else if (inMsgIdPlural) {
+ break;
+ }
+ }
+
+ return unescapePo(raw.toString());
}
- void loadMsgIDFromParent() {
- Field msgIDParent = ReflectionUtils.findField(net.sf.okapi.filters.po.POFilter.class, "msgID");
- msgID = makeAccessibleAndGetString(msgIDParent);
+ private static void appendQuotedContent(StringBuilder sb, String line) {
+ int first = line.indexOf('"');
+ int last = line.lastIndexOf('"');
+ if (first >= 0 && last > first) {
+ sb.append(line, first + 1, last);
+ }
}
- String makeAccessibleAndGetString(Field msgID) {
- ReflectionUtils.makeAccessible(msgID);
+ /** Delegates to the parent's private {@code unescape} method via reflection. */
+ private String unescapePo(String text) {
try {
- return (String) msgID.get(this);
- } catch (IllegalAccessException | IllegalArgumentException e) {
- throw new RuntimeException(e);
+ return (String) PARENT_UNESCAPE.invoke(this, text);
+ } catch (ReflectiveOperationException e) {
+ throw new RuntimeException("Failed to invoke Okapi POFilter.unescape", e);
+ }
+ }
+
+ private static final java.lang.reflect.Method PARENT_UNESCAPE;
+
+ static {
+ try {
+ PARENT_UNESCAPE =
+ net.sf.okapi.filters.po.POFilter.class.getDeclaredMethod("unescape", String.class);
+ PARENT_UNESCAPE.setAccessible(true);
+ } catch (NoSuchMethodException e) {
+ throw new ExceptionInInitializerError(e);
}
}
@@ -425,16 +453,6 @@ Set getUsagesFromSkeleton(String skeleton) {
return locations;
}
- @Override
- public EncoderManager getEncoderManager() {
- if (encoderManager == null) {
- encoderManager = new EncoderManager();
- encoderManager.setMapping(
- MimeTypeMapper.PO_MIME_TYPE, "com.box.l10n.mojito.okapi.filters.POEncoder");
- }
- return encoderManager;
- }
-
static String removeUntranslated(String poFile) {
StringBuilder poFileWithoutUntranslatedPlural = new StringBuilder();
Scanner scanner = new Scanner(poFile);
diff --git a/common/src/test/java/com/box/l10n/mojito/okapi/filters/POFilterTest.java b/common/src/test/java/com/box/l10n/mojito/okapi/filters/POFilterTest.java
index c72a1f13d5..e98ebeca0a 100644
--- a/common/src/test/java/com/box/l10n/mojito/okapi/filters/POFilterTest.java
+++ b/common/src/test/java/com/box/l10n/mojito/okapi/filters/POFilterTest.java
@@ -1,7 +1,6 @@
package com.box.l10n.mojito.okapi.filters;
import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNull;
import java.util.ArrayList;
import java.util.List;
@@ -13,20 +12,6 @@
*/
public class POFilterTest {
- @Test
- public void loadMsgIDPluralFromParent() {
- POFilter poFilter = new POFilter();
- poFilter.loadMsgIDPluralFromParent();
- assertNull(poFilter.msgIDPlural);
- }
-
- @Test
- public void loadMsgIDFromParent() {
- POFilter poFilter = new POFilter();
- poFilter.loadMsgIDFromParent();
- assertNull(poFilter.msgID);
- }
-
@Test
public void getUsagesFromSkeleton() {
String skeleton =
diff --git a/common/src/test/java/com/box/l10n/mojito/okapi/filters/extraction/POFilterExtractionTest.java b/common/src/test/java/com/box/l10n/mojito/okapi/filters/extraction/POFilterExtractionTest.java
index 31d826486e..dc34e3f2e5 100644
--- a/common/src/test/java/com/box/l10n/mojito/okapi/filters/extraction/POFilterExtractionTest.java
+++ b/common/src/test/java/com/box/l10n/mojito/okapi/filters/extraction/POFilterExtractionTest.java
@@ -64,11 +64,13 @@ public void extractWithEscapedDoubleQuotes() throws UnsupportedAssetFilterTypeEx
Assertions.assertThat(units)
.extracting(AssetExtractorTextUnit::getName, AssetExtractorTextUnit::getSource)
- .containsExactly(tuple("He said \\\"hello\\\"", "He said \"hello\""));
+ .containsExactly(tuple("He said \"hello\"", "He said \"hello\""));
}
@Test
- public void extractWithEscapedSingleQuotes() throws UnsupportedAssetFilterTypeException {
+ public void extractWithBackslashAndSingleQuote() throws UnsupportedAssetFilterTypeException {
+ // \' is not a standard PO escape sequence (PO uses double-quote delimiters),
+ // so the backslash is preserved as a literal character by Okapi's parser.
List units =
extract(
"test.pot",
@@ -81,7 +83,7 @@ public void extractWithEscapedSingleQuotes() throws UnsupportedAssetFilterTypeEx
Assertions.assertThat(units)
.extracting(AssetExtractorTextUnit::getName, AssetExtractorTextUnit::getSource)
- .containsExactly(tuple("it\\'s a test", "it's a test"));
+ .containsExactly(tuple("it\\'s a test", "it\\'s a test"));
}
@Test
@@ -157,11 +159,159 @@ public void extractPluralsWithEscapedQuotes() throws UnsupportedAssetFilterTypeE
AssetExtractorTextUnit::getPluralForm,
AssetExtractorTextUnit::getPluralFormOther)
.containsExactly(
- tuple("the \\\"item\\\" _zero", "the \"items\"", "zero", "the \\\"item\\\" _other"),
- tuple("the \\\"item\\\" _one", "the \"item\"", "one", "the \\\"item\\\" _other"),
- tuple("the \\\"item\\\" _two", "the \"items\"", "two", "the \\\"item\\\" _other"),
- tuple("the \\\"item\\\" _few", "the \"items\"", "few", "the \\\"item\\\" _other"),
- tuple("the \\\"item\\\" _many", "the \"items\"", "many", "the \\\"item\\\" _other"),
- tuple("the \\\"item\\\" _other", "the \"items\"", "other", "the \\\"item\\\" _other"));
+ tuple("the \"item\" _zero", "the \"items\"", "zero", "the \"item\" _other"),
+ tuple("the \"item\" _one", "the \"item\"", "one", "the \"item\" _other"),
+ tuple("the \"item\" _two", "the \"items\"", "two", "the \"item\" _other"),
+ tuple("the \"item\" _few", "the \"items\"", "few", "the \"item\" _other"),
+ tuple("the \"item\" _many", "the \"items\"", "many", "the \"item\" _other"),
+ tuple("the \"item\" _other", "the \"items\"", "other", "the \"item\" _other"));
+ }
+
+ @Test
+ public void extractWithEscapedBackslashQuoteNewlineTab()
+ throws UnsupportedAssetFilterTypeException {
+ // based on:
+ // https://gitlab.com/okapiframework/Okapi/-/blob/v1.45.0/okapi/filters/po/src/test/java/net/sf/okapi/filters/po/POFilterTest.java#L241-251
+ List units =
+ extract(
+ "test.pot",
+ """
+ msgid "Text \\\\ and \\" and \\n and \\t"
+ msgstr ""
+ """,
+ null,
+ null);
+
+ Assertions.assertThat(units)
+ .extracting(AssetExtractorTextUnit::getName, AssetExtractorTextUnit::getSource)
+ .containsExactly(tuple("Text \\ and \" and \n and \t", "Text \\ and \" and \n and \t"));
+ }
+
+ @Test
+ public void extractWithEscapedFormFeedBackspaceCarriageReturn()
+ throws UnsupportedAssetFilterTypeException {
+ // based on:
+ // https://gitlab.com/okapiframework/Okapi/-/blob/v1.45.0/okapi/filters/po/src/test/java/net/sf/okapi/filters/po/POFilterTest.java#L253-263
+ List units =
+ extract(
+ "test.pot",
+ """
+ msgid "Text \\f and \\b and \\rr"
+ msgstr ""
+ """,
+ null,
+ null);
+
+ Assertions.assertThat(units)
+ .extracting(AssetExtractorTextUnit::getName, AssetExtractorTextUnit::getSource)
+ .containsExactly(tuple("Text \f and \b and \rr", "Text \f and \b and \rr"));
+ }
+
+ @Test
+ public void extractWithEscapeAtStartOfString() throws UnsupportedAssetFilterTypeException {
+ // based on:
+ // https://gitlab.com/okapiframework/Okapi/-/blob/v1.45.0/okapi/filters/po/src/test/java/net/sf/okapi/filters/po/POFilterTest.java#L265-276
+ List units =
+ extract(
+ "test.pot",
+ """
+ msgid "\\t."
+ msgstr ""
+ """,
+ null,
+ null);
+
+ Assertions.assertThat(units)
+ .extracting(AssetExtractorTextUnit::getName, AssetExtractorTextUnit::getSource)
+ .containsExactly(tuple("\t.", "\t."));
+ }
+
+ @Test
+ public void extractWithUnescapedQuoteInMiddle() throws UnsupportedAssetFilterTypeException {
+ // based on:
+ // https://gitlab.com/okapiframework/Okapi/-/blob/v1.45.0/okapi/filters/po/src/test/java/net/sf/okapi/filters/po/POFilterTest.java#L280-290
+
+ // Verifies parity with how Okapi handles malformed escaping in PO files
+
+ // Malformed PO: unescaped quote in middle of msgid string.
+ // PO Parser uses lastIndexOf('"') to find the closing delimiter,
+ // so given a PO snippet:
+ // `msgid "A " and a \"`
+ // the parsed string content after unescaping is:
+ // `A " and a \`
+ // (both an unescaped inner quote and the single trailing backslash are treated as literals)
+ List units =
+ extract(
+ "test.pot",
+ """
+ msgid "A " and a \\"
+ msgstr ""
+ """,
+ null,
+ null);
+
+ Assertions.assertThat(units)
+ .extracting(AssetExtractorTextUnit::getName, AssetExtractorTextUnit::getSource)
+ .containsExactly(tuple("A \" and a \\", "A \" and a \\"));
+ }
+
+ @Test
+ public void extractWithEscapedAndDanglingBackslash() throws UnsupportedAssetFilterTypeException {
+ // based on:
+ // https://gitlab.com/okapiframework/Okapi/-/blob/v1.45.0/okapi/filters/po/src/test/java/net/sf/okapi/filters/po/POFilterTest.java#L292-303
+
+ // Verifies parity with how Okapi handles malformed escaping in PO files
+
+ // PO snippet
+ // `msgid "\\\"`
+ // has 3 backslashes between quotes: escaped backslash (\\)
+ // followed by a dangling backslash (\) before closing quote.
+ List units =
+ extract(
+ "test.pot",
+ """
+ msgid "\\\\\\"
+ msgstr ""
+ """,
+ null,
+ null);
+
+ // After unescape: \\ → \ (one backslash), dangling \ stays → total 2 backslashes.
+ Assertions.assertThat(units)
+ .extracting(AssetExtractorTextUnit::getName, AssetExtractorTextUnit::getSource)
+ .containsExactly(tuple("\\\\", "\\\\"));
+ }
+
+ @Test
+ public void extractWithNoDoubleUnescaping() throws UnsupportedAssetFilterTypeException {
+ List units =
+ extract(
+ "test.pot",
+ // Actual source string in the localized application UI is:
+ // `Name cannot contain "/", "\", or characters outside the basic multilingual plane.`
+ // In PO file content, each double quote and backslash is escaped with another
+ // backslash.
+ // `msgid "Name cannot contain \"/\", \"\\\", or characters outside the basic
+ // multilingual plane."`
+ // Finally, in Java text block we must escape each backslash additionally
+ """
+ msgid "Name cannot contain \\"/\\", \\"\\\\\\", or characters outside the basic multilingual plane."
+ msgstr ""
+ """,
+ null,
+ null);
+
+ Assertions.assertThat(units)
+ .extracting(AssetExtractorTextUnit::getName, AssetExtractorTextUnit::getSource)
+ .containsExactly(
+ // Mojito should unescape correctly matching the actual UI string
+ // `Name cannot contain "/", "\", or characters outside the basic multilingual plane.`
+ tuple(
+ """
+ Name cannot contain "/", "\\", or characters outside the basic multilingual plane.\
+ """,
+ """
+ Name cannot contain "/", "\\", or characters outside the basic multilingual plane.\
+ """));
}
}
diff --git a/webapp/src/test/java/com/box/l10n/mojito/service/tm/localizeasset/LocalizePoAssetTest.java b/webapp/src/test/java/com/box/l10n/mojito/service/tm/localizeasset/LocalizePoAssetTest.java
index b1d584dfeb..beef4702c5 100644
--- a/webapp/src/test/java/com/box/l10n/mojito/service/tm/localizeasset/LocalizePoAssetTest.java
+++ b/webapp/src/test/java/com/box/l10n/mojito/service/tm/localizeasset/LocalizePoAssetTest.java
@@ -246,7 +246,7 @@ public void testLocalizePoEscaping() throws Exception {
assertEquals(1, textUnitDTOs.size());
TextUnitDTO textUnitDTO = textUnitDTOs.get(0);
- assertEquals("repin \\\"{}\\\"", textUnitDTO.getName());
+ assertEquals("repin \"{}\"", textUnitDTO.getName());
assertEquals("repin \"{}\"", textUnitDTO.getSource());
String localizedAsset = generateLocalized(assetContent, repoLocale, "ja-JP");
@@ -286,7 +286,7 @@ public void testLocalizePoEscaping() throws Exception {
assertEquals(1, textUnitDTOs.size());
textUnitDTO = textUnitDTOs.get(0);
- assertEquals("repin \\\"{}\\\"", textUnitDTO.getName());
+ assertEquals("repin \"{}\"", textUnitDTO.getName());
assertEquals("repin \"{}\"", textUnitDTO.getSource());
assertEquals("repin \"{}\" jp", textUnitDTO.getTarget());
}
@@ -668,4 +668,304 @@ public void testImportLocalizedAssetPoPluralSinglePlural() throws Exception {
""";
importTranslations(repoLocale, localizedAssetContent, StatusForEqualTarget.APPROVED);
}
+
+ @Test
+ public void testLocalizePoEscapedBackslashNewlineTab() throws Exception {
+ // based on:
+ // https://gitlab.com/okapiframework/Okapi/-/blob/v1.45.0/okapi/filters/po/src/test/java/net/sf/okapi/filters/po/POFilterTest.java#L241-251
+
+ Repository repo = createRepository();
+ RepositoryLocale repoLocale = addLocale(repo, "ja-JP");
+
+ String assetContent =
+ """
+ msgid ""
+ msgstr ""
+ "Project-Id-Version: PACKAGE VERSION\\n"
+ "Report-Msgid-Bugs-To: \\n"
+ "POT-Creation-Date: 2017-09-15 11:53-0500\\n"
+ "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
+ "Last-Translator: FULL NAME \\n"
+ "Language-Team: LANGUAGE \\n"
+ "MIME-Version: 1.0\\n"
+ "Plural-Forms: nplurals=2; plural=(n != 1);\\n"
+ "Content-Type: text/plain; charset=utf-8\\n"
+ "Content-Transfer-Encoding: 8bit\\n"
+ #. Comments
+ #: src/main.py:10
+ msgid "Text \\\\ and \\" and \\n and \\t"
+ msgstr ""
+ """;
+
+ String expectedLocalizedAsset =
+ """
+ msgid ""
+ msgstr ""
+ "Project-Id-Version: PACKAGE VERSION\\n"
+ "Report-Msgid-Bugs-To: \\n"
+ "POT-Creation-Date: 2017-09-15 11:53-0500\\n"
+ "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
+ "Last-Translator: FULL NAME \\n"
+ "Language-Team: LANGUAGE \\n"
+ "MIME-Version: 1.0\\n"
+ "Plural-Forms: nplurals=1; plural=0;\\n"
+ "Content-Type: text/plain; charset=utf-8\\n"
+ "Content-Transfer-Encoding: 8bit\\n"
+ #. Comments
+ #: src/main.py:10
+ msgid "Text \\\\ and \\" and \\n and \\t"
+ msgstr "Text \\\\ and \\" and \\n and \\t"
+ """;
+
+ createAsset(repo, "messages.pot", assetContent);
+ processAsset(repo, assetContent);
+
+ List textUnitDTOs = searchTextUnits(repo);
+
+ assertEquals(1, textUnitDTOs.size());
+ TextUnitDTO textUnitDTO = textUnitDTOs.get(0);
+ assertEquals("Text \\ and \" and \n and \t", textUnitDTO.getName());
+ assertEquals("Text \\ and \" and \n and \t", textUnitDTO.getSource());
+
+ String localizedAsset = generateLocalized(assetContent, repoLocale, "ja-JP");
+ logger.debug("localized=\n{}", localizedAsset);
+ assertEquals(expectedLocalizedAsset, localizedAsset);
+
+ String forImport =
+ """
+ msgid ""
+ msgstr ""
+ "Project-Id-Version: PACKAGE VERSION\\n"
+ "Report-Msgid-Bugs-To: \\n"
+ "POT-Creation-Date: 2017-09-15 11:53-0500\\n"
+ "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
+ "Last-Translator: FULL NAME \\n"
+ "Language-Team: LANGUAGE \\n"
+ "MIME-Version: 1.0\\n"
+ "Plural-Forms: nplurals=1; plural=0;\\n"
+ "Content-Type: text/plain; charset=utf-8\\n"
+ "Content-Transfer-Encoding: 8bit\\n"
+ #. Comments
+ #: src/main.py:10
+ msgid "Text \\\\ and \\" and \\n and \\t"
+ msgstr "Text \\\\ and \\" and \\n and \\t jp"
+ """;
+
+ importTranslations(repoLocale, forImport, StatusForEqualTarget.TRANSLATION_NEEDED);
+
+ localizedAsset = generateLocalized(assetContent, repoLocale, "ja-JP");
+ logger.debug("localized after import=\n{}", localizedAsset);
+ assertEquals(forImport, localizedAsset);
+
+ TextUnitSearcherParameters textUnitSearcherParameters = new TextUnitSearcherParameters();
+ textUnitSearcherParameters.setRepositoryIds(repo.getId());
+ textUnitSearcherParameters.setStatusFilter(StatusFilter.TRANSLATED);
+ textUnitSearcherParameters.setLocaleId(repoLocale.getLocale().getId());
+ textUnitDTOs = textUnitSearcher.search(textUnitSearcherParameters);
+
+ assertEquals(1, textUnitDTOs.size());
+ textUnitDTO = textUnitDTOs.get(0);
+ assertEquals("Text \\ and \" and \n and \t", textUnitDTO.getName());
+ assertEquals("Text \\ and \" and \n and \t", textUnitDTO.getSource());
+ assertEquals("Text \\ and \" and \n and \t jp", textUnitDTO.getTarget());
+ }
+
+ @Test
+ public void testLocalizePoUnescapedRewrite() throws Exception {
+ // based on:
+ // https://gitlab.com/okapiframework/Okapi/-/blob/v1.45.0/okapi/filters/po/src/test/java/net/sf/okapi/filters/po/POFilterTest.java#L308-312
+
+ // Verifies parity with how Okapi handles malformed escaping in PO files
+
+ Repository repo = createRepository();
+ RepositoryLocale repoLocale = addLocale(repo, "ja-JP");
+
+ // Malformed PO: unescaped quote in middle of msgid string.
+ // PO Parser uses lastIndexOf('"') to find the closing delimiter,
+ // so given a PO snippet:
+ // `msgid "A " and a \"`
+ // the parsed string content after unescaping is:
+ // `A " and a \`
+ // (both an unescaped inner quote and the single trailing backslash are treated as literals)
+ String assetContent =
+ """
+ msgid ""
+ msgstr ""
+ "Project-Id-Version: PACKAGE VERSION\\n"
+ "Report-Msgid-Bugs-To: \\n"
+ "POT-Creation-Date: 2017-09-15 11:53-0500\\n"
+ "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
+ "Last-Translator: FULL NAME \\n"
+ "Language-Team: LANGUAGE \\n"
+ "MIME-Version: 1.0\\n"
+ "Plural-Forms: nplurals=2; plural=(n != 1);\\n"
+ "Content-Type: text/plain; charset=utf-8\\n"
+ "Content-Transfer-Encoding: 8bit\\n"
+ msgid "A " and a \\"
+ msgstr ""
+ """;
+
+ // On localization (source copied to target), encoder should properly re-escape:
+ // `msgstr "A \" and a \\"
+ String expectedLocalizedAsset =
+ """
+ msgid ""
+ msgstr ""
+ "Project-Id-Version: PACKAGE VERSION\\n"
+ "Report-Msgid-Bugs-To: \\n"
+ "POT-Creation-Date: 2017-09-15 11:53-0500\\n"
+ "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
+ "Last-Translator: FULL NAME \\n"
+ "Language-Team: LANGUAGE \\n"
+ "MIME-Version: 1.0\\n"
+ "Plural-Forms: nplurals=1; plural=0;\\n"
+ "Content-Type: text/plain; charset=utf-8\\n"
+ "Content-Transfer-Encoding: 8bit\\n"
+ msgid "A " and a \\"
+ msgstr "A \\" and a \\\\"
+ """;
+
+ createAsset(repo, "messages.pot", assetContent);
+ processAsset(repo, assetContent);
+
+ List textUnitDTOs = searchTextUnits(repo);
+
+ assertEquals(1, textUnitDTOs.size());
+ TextUnitDTO textUnitDTO = textUnitDTOs.get(0);
+ assertEquals(
+ """
+ A " and a \\""",
+ textUnitDTO.getName());
+ assertEquals(
+ """
+ A " and a \\""",
+ textUnitDTO.getSource());
+
+ String localizedAsset = generateLocalized(assetContent, repoLocale, "ja-JP");
+ logger.debug("localized=\n{}", localizedAsset);
+ assertEquals(expectedLocalizedAsset, localizedAsset);
+ }
+
+ @Test
+ public void testLocalizePoEscapingSymmetry() throws Exception {
+
+ Repository repo = createRepository();
+ RepositoryLocale repoLocale = addLocale(repo, "ja-JP");
+
+ // Actual source string in the localized application UI is:
+ // `Name cannot contain "/", "\", or characters outside the basic multilingual plane.`
+ // `msgid "Name cannot contain \"/\", \"\\\", or characters outside the basic multilingual
+ // plane."`
+ // Finally, in Java text block we must escape each backslash additionally
+
+ String assetContent =
+ """
+ msgid ""
+ msgstr ""
+ "Project-Id-Version: PACKAGE VERSION\\n"
+ "Report-Msgid-Bugs-To: \\n"
+ "POT-Creation-Date: 2017-09-15 11:53-0500\\n"
+ "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
+ "Last-Translator: FULL NAME \\n"
+ "Language-Team: LANGUAGE \\n"
+ "MIME-Version: 1.0\\n"
+ "Plural-Forms: nplurals=2; plural=(n != 1);\\n"
+ "Content-Type: text/plain; charset=utf-8\\n"
+ "Content-Transfer-Encoding: 8bit\\n"
+ #: src/main.py:10
+ msgid "Name cannot contain \\"/\\", \\"\\\\\\", or characters outside the basic multilingual plane."
+ msgstr ""
+ """;
+
+ // Target string backfilled from source must match the source exactly,
+ // i.e. the unescaping when parsing and re-escaping when generating a localized asset
+ // must be symmetric
+ String expectedLocalizedAsset =
+ """
+ msgid ""
+ msgstr ""
+ "Project-Id-Version: PACKAGE VERSION\\n"
+ "Report-Msgid-Bugs-To: \\n"
+ "POT-Creation-Date: 2017-09-15 11:53-0500\\n"
+ "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
+ "Last-Translator: FULL NAME \\n"
+ "Language-Team: LANGUAGE \\n"
+ "MIME-Version: 1.0\\n"
+ "Plural-Forms: nplurals=1; plural=0;\\n"
+ "Content-Type: text/plain; charset=utf-8\\n"
+ "Content-Transfer-Encoding: 8bit\\n"
+ #: src/main.py:10
+ msgid "Name cannot contain \\"/\\", \\"\\\\\\", or characters outside the basic multilingual plane."
+ msgstr "Name cannot contain \\"/\\", \\"\\\\\\", or characters outside the basic multilingual plane."
+ """;
+
+ createAsset(repo, "messages.pot", assetContent);
+ processAsset(repo, assetContent);
+
+ List textUnitDTOs = searchTextUnits(repo);
+
+ assertEquals(1, textUnitDTOs.size());
+ TextUnitDTO textUnitDTO = textUnitDTOs.get(0);
+ assertEquals(
+ """
+ Name cannot contain "/", "\\", or characters outside the basic multilingual plane.""",
+ textUnitDTO.getName());
+ assertEquals(
+ """
+ Name cannot contain "/", "\\", or characters outside the basic multilingual plane.""",
+ textUnitDTO.getSource());
+
+ // Verify source is copied to target with proper re-escaping
+ String localizedAsset = generateLocalized(assetContent, repoLocale, "ja-JP");
+ logger.debug("localized=\n{}", localizedAsset);
+ assertEquals(expectedLocalizedAsset, localizedAsset);
+
+ // Import a translation where the target has a literal backslash.
+ // `Nazwa nie może zawierać „/”, „\”, ani znaków spoza podstawowej klawiatury wielojęzycznej.`
+ // (Note that in Polish translation those are different quote characters which don't need
+ // escaping)
+ // Translator should be able to type a literal target string in Mojito UI
+ // which should then be correctly escaped when generating a localized asset
+ // In PO file content, the backslash is escaped with another backslash:
+ // `msgstr "Nazwa nie może zawierać „/”, „\\\\”, ani znaków spoza podstawowej klawiatury
+ // wielojęzycznej."`
+ String forImport =
+ """
+ msgid ""
+ msgstr ""
+ "Project-Id-Version: PACKAGE VERSION\\n"
+ "Report-Msgid-Bugs-To: \\n"
+ "POT-Creation-Date: 2017-09-15 11:53-0500\\n"
+ "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
+ "Last-Translator: FULL NAME \\n"
+ "Language-Team: LANGUAGE \\n"
+ "MIME-Version: 1.0\\n"
+ "Plural-Forms: nplurals=1; plural=0;\\n"
+ "Content-Type: text/plain; charset=utf-8\\n"
+ "Content-Transfer-Encoding: 8bit\\n"
+ #: src/main.py:10
+ msgid "Name cannot contain \\"/\\", \\"\\\\\\", or characters outside the basic multilingual plane."
+ msgstr "Nazwa nie może zawierać „/”, „\\\\”, ani znaków spoza podstawowej klawiatury wielojęzycznej."
+ """;
+
+ importTranslations(repoLocale, forImport, StatusForEqualTarget.TRANSLATION_NEEDED);
+
+ TextUnitSearcherParameters textUnitSearcherParameters = new TextUnitSearcherParameters();
+ textUnitSearcherParameters.setRepositoryIds(repo.getId());
+ textUnitSearcherParameters.setStatusFilter(StatusFilter.TRANSLATED);
+ textUnitSearcherParameters.setLocaleId(repoLocale.getLocale().getId());
+ textUnitDTOs = textUnitSearcher.search(textUnitSearcherParameters);
+
+ assertEquals(1, textUnitDTOs.size());
+ textUnitDTO = textUnitDTOs.get(0);
+ // TM should store the unescaped target (single backslash)
+ assertEquals(
+ "Nazwa nie może zawierać „/”, „\\”, ani znaków spoza podstawowej klawiatury wielojęzycznej.",
+ textUnitDTO.getTarget());
+
+ // Re-generate localized PO — encoder should re-escape \ to \\
+ localizedAsset = generateLocalized(assetContent, repoLocale, "ja-JP");
+ logger.debug("localized after import=\n{}", localizedAsset);
+ assertEquals(forImport, localizedAsset);
+ }
}