diff --git a/cli/src/main/java/com/box/l10n/mojito/cli/command/PseudoLocCommand.java b/cli/src/main/java/com/box/l10n/mojito/cli/command/PseudoLocCommand.java index 6a554c5948..1627841c93 100644 --- a/cli/src/main/java/com/box/l10n/mojito/cli/command/PseudoLocCommand.java +++ b/cli/src/main/java/com/box/l10n/mojito/cli/command/PseudoLocCommand.java @@ -99,6 +99,16 @@ public class PseudoLocCommand extends Command { description = Param.DIR_PATH_EXCLUDE_PATTERNS_DESCRIPTION) List directoriesExcludePatterns = null; + @Parameter( + names = {"--substitute"}, + arity = 1, + required = false, + description = + "Character substitution mode: RANDOM (default) picks a random diacritical replacement each time," + + " CONSISTENT always maps to the same replacement within a given string.", + converter = SubstituteTypeConverter.class) + LocalizedAssetBody.SubstituteType substituteType = LocalizedAssetBody.SubstituteType.RANDOM; + @Autowired AssetClient assetClient; @Autowired CommandHelper commandHelper; @@ -192,7 +202,8 @@ LocalizedAssetBody getPseudoLocalizedAsset( assetByPathAndRepositoryId.getId(), assetContent, sourceFileMatch.getFileType().getFilterConfigIdOverride(), - filterOptions); + filterOptions, + substituteType); logger.trace("PseudoLocalizedAsset content = {}", pseudoLocalizedAsset.getContent()); return pseudoLocalizedAsset; diff --git a/cli/src/main/java/com/box/l10n/mojito/cli/command/SubstituteTypeConverter.java b/cli/src/main/java/com/box/l10n/mojito/cli/command/SubstituteTypeConverter.java new file mode 100644 index 0000000000..7d89b8edc6 --- /dev/null +++ b/cli/src/main/java/com/box/l10n/mojito/cli/command/SubstituteTypeConverter.java @@ -0,0 +1,11 @@ +package com.box.l10n.mojito.cli.command; + +import com.box.l10n.mojito.rest.entity.LocalizedAssetBody; + +public class SubstituteTypeConverter extends EnumConverter { + + @Override + protected Class getGenericClass() { + return LocalizedAssetBody.SubstituteType.class; + } +} diff --git 
a/restclient/src/main/java/com/box/l10n/mojito/rest/client/AssetClient.java b/restclient/src/main/java/com/box/l10n/mojito/rest/client/AssetClient.java index 8ca90a7086..eafbd249f3 100644 --- a/restclient/src/main/java/com/box/l10n/mojito/rest/client/AssetClient.java +++ b/restclient/src/main/java/com/box/l10n/mojito/rest/client/AssetClient.java @@ -209,13 +209,15 @@ public PollableTask getLocalizedAssetForContentAsync( * @param filterConfigIdOverride Optional, can be null. Allows to specify a specific Okapi filter * to use to process the asset * @param filterOptions + * @param substituteType How replacement characters are picked: RANDOM or CONSISTENT * @return the pseudoloocalized asset content */ public LocalizedAssetBody getPseudoLocalizedAssetForContent( Long assetId, String content, FilterConfigIdOverride filterConfigIdOverride, - List filterOptions) { + List filterOptions, + LocalizedAssetBody.SubstituteType substituteType) { UriComponentsBuilder uriBuilder = UriComponentsBuilder.fromPath(getBasePathForResource(assetId, "pseudo")); @@ -225,6 +227,7 @@ public LocalizedAssetBody getPseudoLocalizedAssetForContent( localizedAssetBody.setOutputBcp47tag(OUTPUT_BCP47_TAG); localizedAssetBody.setFilterConfigIdOverride(filterConfigIdOverride); localizedAssetBody.setFilterOptions(filterOptions); + localizedAssetBody.setSubstituteType(substituteType); return authenticatedRestTemplate.postForObject( uriBuilder.toUriString(), localizedAssetBody, LocalizedAssetBody.class); diff --git a/restclient/src/main/java/com/box/l10n/mojito/rest/entity/LocalizedAssetBody.java b/restclient/src/main/java/com/box/l10n/mojito/rest/entity/LocalizedAssetBody.java index d6983feed6..6fea782f9b 100644 --- a/restclient/src/main/java/com/box/l10n/mojito/rest/entity/LocalizedAssetBody.java +++ b/restclient/src/main/java/com/box/l10n/mojito/rest/entity/LocalizedAssetBody.java @@ -26,6 +26,17 @@ public enum Status { ACCEPTED } + /** + * During pseudolocalization, specifies how accent/diacritics characters that replace ASCII + * letters are 
chosen. + */ + public enum SubstituteType { + /** Replacement characters are picked at random. */ + RANDOM, + /** Replacement characters are picked consistently for a given string. */ + CONSISTENT + } + /** Asset id */ Long assetId; @@ -66,6 +77,8 @@ public enum Status { Status status = Status.ALL; + SubstituteType substituteType; + public Long getAssetId() { return assetId; } @@ -145,4 +158,12 @@ public String getPullRunName() { public void setPullRunName(String pullRunName) { this.pullRunName = pullRunName; } + + public SubstituteType getSubstituteType() { + return substituteType; + } + + public void setSubstituteType(SubstituteType substituteType) { + this.substituteType = substituteType; + } } diff --git a/webapp/src/main/java/com/box/l10n/mojito/okapi/PseudoLocalizeStep.java b/webapp/src/main/java/com/box/l10n/mojito/okapi/PseudoLocalizeStep.java index b6faed2acd..d17a6eced6 100644 --- a/webapp/src/main/java/com/box/l10n/mojito/okapi/PseudoLocalizeStep.java +++ b/webapp/src/main/java/com/box/l10n/mojito/okapi/PseudoLocalizeStep.java @@ -2,6 +2,7 @@ import com.box.l10n.mojito.entity.Asset; import com.box.l10n.mojito.pseudoloc.PseudoLocalization; +import com.box.l10n.mojito.pseudoloc.PseudoLocalization.SubstituteType; import com.box.l10n.mojito.service.assetintegritychecker.integritychecker.IntegrityCheckerFactory; import com.box.l10n.mojito.service.assetintegritychecker.integritychecker.TextUnitIntegrityChecker; import com.box.l10n.mojito.service.tm.TMTextUnitRepository; @@ -29,11 +30,17 @@ public class PseudoLocalizeStep extends BasePipelineStep { static Logger logger = LoggerFactory.getLogger(PseudoLocalizeStep.class); private Asset asset; + private SubstituteType substituteType; private LocaleId targetLocale; private Set textUnitIntegrityCheckers = new HashSet<>(); public PseudoLocalizeStep(Asset asset) { + this(asset, SubstituteType.RANDOM); + } + + public PseudoLocalizeStep(Asset asset, SubstituteType substituteType) { this.asset = asset; + 
this.substituteType = substituteType; } @Autowired IntegrityCheckerFactory integrityCheckerFactory; @@ -81,7 +88,8 @@ protected Event handleTextUnit(Event event) { if (textUnit.isTranslatable()) { String source = textUnitUtils.getSourceAsString(textUnit); String pseudoTranslation = - pseudoLocalization.convertStringToPseudoLoc(source, textUnitIntegrityCheckers); + pseudoLocalization.convertStringToPseudoLoc( + source, textUnitIntegrityCheckers, substituteType); textUnit.setTarget(targetLocale, new TextContainer(pseudoTranslation)); } diff --git a/webapp/src/main/java/com/box/l10n/mojito/pseudoloc/PseudoLocalization.java b/webapp/src/main/java/com/box/l10n/mojito/pseudoloc/PseudoLocalization.java index 2cc9335835..6828f14d26 100644 --- a/webapp/src/main/java/com/box/l10n/mojito/pseudoloc/PseudoLocalization.java +++ b/webapp/src/main/java/com/box/l10n/mojito/pseudoloc/PseudoLocalization.java @@ -17,6 +17,14 @@ @Component public class PseudoLocalization { + /** Specifies how accent/diacritics characters that replace ASCII letters are chosen */ + public enum SubstituteType { + /** Replacement characters are picked at random. */ + RANDOM, + /** Replacement characters are picked consistently for a given string. 
*/ + CONSISTENT + } + /** Logger */ static Logger logger = LoggerFactory.getLogger(PseudoLocalization.class); @@ -87,15 +95,21 @@ public class PseudoLocalization { * @return pseudo localized string */ public String convertStringToPseudoLoc(String string, Set checkers) { + return convertStringToPseudoLoc(string, checkers, SubstituteType.RANDOM); + } + + public String convertStringToPseudoLoc( + String string, Set checkers, SubstituteType substituteType) { TextUnitIntegrityChecker checker = getIntegrityCheckerForPlaceholderProcessing(checkers); if (checker == null) { logger.debug("There is no checker for pseudolocalization placeholder processing."); - return convertStringToPseudoLoc(string); + return convertStringToPseudoLoc(string, substituteType); } else { logger.debug("Found checker for pseudolocalization placeholder processing."); LocalizableString localizableString = checker.extractNonLocalizableParts(string); - String pseudolocalized = convertStringToPseudoLoc(localizableString.getLocalizableString()); + String pseudolocalized = + convertStringToPseudoLoc(localizableString.getLocalizableString(), substituteType); localizableString.setLocalizableString(pseudolocalized); return checker.restoreNonLocalizableParts(localizableString); } @@ -108,10 +122,14 @@ public String convertStringToPseudoLoc(String string, Set convertAsciiToDiacriticsRandom(string); + case CONSISTENT -> convertAsciiToDiacriticsConsistent(string); + }; } /** - * Get a non ASCII character mapping to provided character or the character itself if there is no - * mapping - * - * @param character ASCII character to be mapped - * @return Non ASCII character or character itself + * Converts ASCII letters in the whole string into equivalent characters with accent/diacritics, + * selecting mapped characters consistently. This will always return the same mapped string for a + * given input. 
*/ - private char getMappingCharFromMap(char character) { - char mappedChar = character; + private String convertAsciiToDiacriticsConsistent(String string) { + StringBuilder builder = new StringBuilder(); + + // index of the replacement last used for each source character within this string + Map<Character, Integer> lastMappedIdx = new HashMap<>(); - String mappingCharsForChar = pseudoLocMap.get(mappedChar); + for (char character : string.toCharArray()) { + String mappingsForChar = pseudoLocMap.get(character); - if (mappingCharsForChar != null) { - int maxIndex = mappingCharsForChar.length() - 1; - int randomIndex = (int) (Math.random() * maxIndex); - mappedChar = mappingCharsForChar.charAt(randomIndex); + if (mappingsForChar == null || mappingsForChar.isEmpty()) { + // don't replace if no mapping available (null or empty) + builder.append(character); + continue; + } + + // pick next mapped char (or go back to the beginning if used all of them) + int mappedIdx = (1 + lastMappedIdx.getOrDefault(character, -1)) % mappingsForChar.length(); + lastMappedIdx.put(character, mappedIdx); + char mappedCharacter = mappingsForChar.charAt(mappedIdx); + + builder.append(mappedCharacter); } + return builder.toString(); + } + + /** + * Converts ASCII letters in the whole string into equivalent characters with accent/diacritics, + * selecting mapped characters at random. The result may differ between calls, even if the + * input does not change. 
+ */ + private String convertAsciiToDiacriticsRandom(String string) { + int stringLength = string.length(); + StringBuilder sb = new StringBuilder(); + + for (int i = 0; i < stringLength; i++) { + char character = string.charAt(i); + String mappingCharsForChar = pseudoLocMap.get(character); - return mappedChar; + if (mappingCharsForChar != null && !mappingCharsForChar.isEmpty()) { + // scale by the full length so the last replacement can be picked too + int randomIndex = (int) (Math.random() * mappingCharsForChar.length()); + character = mappingCharsForChar.charAt(randomIndex); + } + + sb.append(character); + } + return sb.toString(); } /** diff --git a/webapp/src/main/java/com/box/l10n/mojito/rest/asset/AssetWS.java b/webapp/src/main/java/com/box/l10n/mojito/rest/asset/AssetWS.java index c9d1c5fc9d..8b3e857d1a 100644 --- a/webapp/src/main/java/com/box/l10n/mojito/rest/asset/AssetWS.java +++ b/webapp/src/main/java/com/box/l10n/mojito/rest/asset/AssetWS.java @@ -11,6 +11,7 @@ import com.box.l10n.mojito.entity.TMXliff; import com.box.l10n.mojito.json.ObjectMapper; import com.box.l10n.mojito.okapi.asset.UnsupportedAssetFilterTypeException; +import com.box.l10n.mojito.pseudoloc.PseudoLocalization; import com.box.l10n.mojito.quartz.QuartzJobInfo; import com.box.l10n.mojito.quartz.QuartzPollableTaskScheduler; import com.box.l10n.mojito.rest.View; @@ -306,9 +307,17 @@ public LocalizedAssetBody getPseudoLocalizedAssetForContent( Asset asset = assetRepository.getOne(assetId); String normalizedContent = NormalizationUtils.normalize(localizedAssetBody.getContent()); + PseudoLocalization.SubstituteType substituteType = + localizedAssetBody.getSubstituteType() != null + ? 
localizedAssetBody.getSubstituteType() + : PseudoLocalization.SubstituteType.RANDOM; + String generateLocalized = tmService.generatePseudoLocalized( - asset, normalizedContent, localizedAssetBody.getFilterConfigIdOverride()); + asset, + normalizedContent, + localizedAssetBody.getFilterConfigIdOverride(), + substituteType); localizedAssetBody.setContent(generateLocalized); diff --git a/webapp/src/main/java/com/box/l10n/mojito/rest/asset/LocalizedAssetBody.java b/webapp/src/main/java/com/box/l10n/mojito/rest/asset/LocalizedAssetBody.java index 7e79958b14..7eb0b97a22 100644 --- a/webapp/src/main/java/com/box/l10n/mojito/rest/asset/LocalizedAssetBody.java +++ b/webapp/src/main/java/com/box/l10n/mojito/rest/asset/LocalizedAssetBody.java @@ -3,6 +3,7 @@ import com.box.l10n.mojito.okapi.FilterConfigIdOverride; import com.box.l10n.mojito.okapi.InheritanceMode; import com.box.l10n.mojito.okapi.Status; +import com.box.l10n.mojito.pseudoloc.PseudoLocalization; import java.util.List; /** @@ -50,6 +51,8 @@ public class LocalizedAssetBody { Status status = Status.ALL; + PseudoLocalization.SubstituteType substituteType; + public LocalizedAssetBody() {} public LocalizedAssetBody(String bcp47Tag, String content) { @@ -136,4 +139,12 @@ public String getPullRunName() { public void setPullRunName(String pullRunName) { this.pullRunName = pullRunName; } + + public PseudoLocalization.SubstituteType getSubstituteType() { + return substituteType; + } + + public void setSubstituteType(PseudoLocalization.SubstituteType substituteType) { + this.substituteType = substituteType; + } } diff --git a/webapp/src/main/java/com/box/l10n/mojito/service/tm/TMService.java b/webapp/src/main/java/com/box/l10n/mojito/service/tm/TMService.java index a5b830075a..1243eb844a 100644 --- a/webapp/src/main/java/com/box/l10n/mojito/service/tm/TMService.java +++ b/webapp/src/main/java/com/box/l10n/mojito/service/tm/TMService.java @@ -41,6 +41,7 @@ import com.box.l10n.mojito.okapi.qualitycheck.QualityCheckStep; 
import com.box.l10n.mojito.okapi.steps.CheckForDoNotTranslateStep; import com.box.l10n.mojito.okapi.steps.FilterEventsToInMemoryRawDocumentStep; +import com.box.l10n.mojito.pseudoloc.PseudoLocalization; import com.box.l10n.mojito.quartz.QuartzJobInfo; import com.box.l10n.mojito.quartz.QuartzPollableTaskScheduler; import com.box.l10n.mojito.retry.DataIntegrityViolationExceptionRetryTemplate; @@ -1098,10 +1099,21 @@ void replaceUsedTmTextUnitVariantIds( public String generatePseudoLocalized( Asset asset, String content, FilterConfigIdOverride filterConfigIdOverride) throws UnsupportedAssetFilterTypeException { + return generatePseudoLocalized( + asset, content, filterConfigIdOverride, PseudoLocalization.SubstituteType.RANDOM); + } + + public String generatePseudoLocalized( + Asset asset, + String content, + FilterConfigIdOverride filterConfigIdOverride, + PseudoLocalization.SubstituteType substituteType) + throws UnsupportedAssetFilterTypeException { String bcp47tag = "en-x-psaccent"; - BasePipelineStep pseudoLocalizedStep = (BasePipelineStep) new PseudoLocalizeStep(asset); + BasePipelineStep pseudoLocalizedStep = + (BasePipelineStep) new PseudoLocalizeStep(asset, substituteType); return generateLocalizedBase( asset, content, filterConfigIdOverride, null, pseudoLocalizedStep, bcp47tag); } diff --git a/webapp/src/test/java/com/box/l10n/mojito/pseudoloc/PseudoLocalizationTest.java b/webapp/src/test/java/com/box/l10n/mojito/pseudoloc/PseudoLocalizationTest.java index 70b370b5b8..beeee5be94 100644 --- a/webapp/src/test/java/com/box/l10n/mojito/pseudoloc/PseudoLocalizationTest.java +++ b/webapp/src/test/java/com/box/l10n/mojito/pseudoloc/PseudoLocalizationTest.java @@ -2,6 +2,7 @@ import static org.junit.Assert.*; +import com.box.l10n.mojito.pseudoloc.PseudoLocalization.SubstituteType; import com.box.l10n.mojito.service.assetintegritychecker.integritychecker.MessageFormatIntegrityChecker; import 
com.box.l10n.mojito.service.assetintegritychecker.integritychecker.TextUnitIntegrityChecker; import com.box.l10n.mojito.service.assetintegritychecker.integritychecker.WhitespaceIntegrityChecker; @@ -27,6 +28,15 @@ public void testStringIsConvertedToDiacritics() { "The string should be converted to diacritics", "English Sentence", diacriticsString); } + @Test + public void testStringIsConvertedToDiacriticsRandom() { + PseudoLocalization ps = new PseudoLocalization(); + String first = ps.convertAsciiToDiacritics("English Sentence"); + String second = ps.convertAsciiToDiacritics("English Sentence"); + assertNotEquals( + "Default substitution should randomly select diacritics across calls", first, second); + } + @Test public void testStringIsNotConvertedToDiacritics() { // The chars q, Q, and V are not converted so they should not be converted @@ -43,6 +53,15 @@ public void testconvertStringToPseudoLoc() { assertNotEquals("The string should be pseudolocalized", "English Sentence", pseudoLocalized); } + @Test + public void testconvertStringToPseudoLocRandom() { + PseudoLocalization ps = new PseudoLocalization(); + String first = ps.convertStringToPseudoLoc("English Sentence"); + String second = ps.convertStringToPseudoLoc("English Sentence"); + assertNotEquals( + "Default pseudolocalization should substitute diacritics randomly", first, second); + } + @Test public void testconvertStringToPseudoLocDoesNot() { PseudoLocalization ps = new PseudoLocalization(); @@ -149,4 +168,50 @@ public void testConvertPluralMessageFormatStringToPseudoLoc3() { "The plural text variation should be pseudolocalized while the placeholder should not", pseudoLocalized.contains("{# Comments or Tasks}")); } + + @Test + public void testConsistentSubstitutionIsDeterministic() { + PseudoLocalization ps = new PseudoLocalization(); + String first = ps.convertAsciiToDiacritics("Hello World", SubstituteType.CONSISTENT); + String second = ps.convertAsciiToDiacritics("Hello World", 
SubstituteType.CONSISTENT); + assertEquals("Consistent substitution should produce identical results", first, second); + } + + @Test + public void testConsistentSubstitutionProducesDiacritics() { + PseudoLocalization ps = new PseudoLocalization(); + String result = ps.convertAsciiToDiacritics("Hello", SubstituteType.CONSISTENT); + assertNotEquals("Consistent substitution should still transform the string", "Hello", result); + } + + @Test + public void testConsistentConvertStringToPseudoLoc() { + PseudoLocalization ps = new PseudoLocalization(); + String first = ps.convertStringToPseudoLoc("English Sentence", SubstituteType.CONSISTENT); + String second = ps.convertStringToPseudoLoc("English Sentence", SubstituteType.CONSISTENT); + assertEquals( + "Consistent pseudolocalization should be deterministic across calls", first, second); + } + + @Test + public void testConsistentConvertStringToPseudoLocWithCheckers() { + PseudoLocalization ps = new PseudoLocalization(); + Set checkers = new HashSet<>(); + checkers.add(new MessageFormatIntegrityChecker()); + String first = + ps.convertStringToPseudoLoc("Hello {name}, welcome!", checkers, SubstituteType.CONSISTENT); + String second = + ps.convertStringToPseudoLoc("Hello {name}, welcome!", checkers, SubstituteType.CONSISTENT); + assertEquals("Consistent substitution with checkers should be deterministic", first, second); + assertTrue( + "Placeholders should be preserved with consistent substitution", first.contains("{name}")); + } + + @Test + public void testConsistentDoesNotConvertUnmappedChars() { + PseudoLocalization ps = new PseudoLocalization(); + String result = ps.convertAsciiToDiacritics("qQV", SubstituteType.CONSISTENT); + assertEquals( + "Unmapped chars should remain unchanged with consistent substitution", "qQV", result); + } }