From c5372452089b90505a25b9bc6d85c8e3a28a506a Mon Sep 17 00:00:00 2001 From: Mauro Cassani Date: Mon, 25 Nov 2024 16:39:14 +0100 Subject: [PATCH 1/2] Preserve spaces between in the --- src/XliffReplacer/AbstractXliffReplacer.php | 26 ++++++++ src/XliffReplacer/Xliff12.php | 13 ++++ tests/XliffReplacerTest.php | 73 +++++++++++++++++++-- tests/files/mrk-with-space.xliff | 39 +++++++++++ 4 files changed, 145 insertions(+), 6 deletions(-) create mode 100644 tests/files/mrk-with-space.xliff diff --git a/src/XliffReplacer/AbstractXliffReplacer.php b/src/XliffReplacer/AbstractXliffReplacer.php index 2688eba..6c446a1 100644 --- a/src/XliffReplacer/AbstractXliffReplacer.php +++ b/src/XliffReplacer/AbstractXliffReplacer.php @@ -54,6 +54,8 @@ abstract class AbstractXliffReplacer { 'eq_word_count' => 0, ]; + protected $mrkTagsMap = []; + /** * AbstractXliffReplacer constructor. * @@ -303,6 +305,30 @@ protected function postProcAndFlush( $fp, string $data, bool $treatAsCDATA = fal //postprocess string $data = preg_replace( "/" . self::$INTERNAL_TAG_PLACEHOLDER . '(.*?)' . self::$INTERNAL_TAG_PLACEHOLDER . "/", '&$1;', $data ); $data = str_replace( ' ', ' ', $data ); + + // check if there are spaces between tags + preg_match_all('/]*>(.*?)<\/mrk>(\s+)/', $data, $spacesBetweenMrkCheck); + + if(!empty($spacesBetweenMrkCheck[0])){ + + // $spacesBetweenMrkCheck[0] // holds the complete tags + // $spacesBetweenMrkCheck[1] // holds the text + // $spacesBetweenMrkCheck[2] // holds the spaces + + foreach ($spacesBetweenMrkCheck[0] as $index => $mrk){ + preg_match('/mid="(\d+)"/', $mrk, $markMatch); + + if(isset($markMatch[1])){ + + if(!isset($this->mrkTagsMap[$this->currentTransUnitId])){ + $this->mrkTagsMap[$this->currentTransUnitId] = []; + } + + $this->mrkTagsMap[$this->currentTransUnitId][$markMatch[1]] = $spacesBetweenMrkCheck[2][$index]; + } + } + } + if ( !$treatAsCDATA ) { //unix2dos $data = str_replace( "\r\n", "\r", $data ); diff --git a/src/XliffReplacer/Xliff12.php b/src/XliffReplacer/Xliff12.php index 79f5e18..24970c7 100644 --- a/src/XliffReplacer/Xliff12.php +++ b/src/XliffReplacer/Xliff12.php @@ -228,10 +228,23 @@ protected function prepareTranslation( array $seg, string $transUnitTranslation return $transUnitTranslation; } + /** + * @param array $seg + * @param string $translation + * @return string + */ protected function rebuildMarks( array $seg, string $translation ): string { if ( $seg[ 'mrk_id' ] !== null && $seg[ 'mrk_id' ] != '' ) { $translation = "" . $seg[ 'mrk_prev_tags' ] . $translation . $seg[ 'mrk_succ_tags' ] . ""; + + // check if there is a trailing space in the map of this trans unit + if( + isset($this->mrkTagsMap[$this->currentTransUnitId]) and + isset($this->mrkTagsMap[$this->currentTransUnitId][$seg[ 'mrk_id' ]]) + ){ + $translation .= $this->mrkTagsMap[$this->currentTransUnitId][$seg[ 'mrk_id' ]]; + } } return $translation; diff --git a/tests/XliffReplacerTest.php b/tests/XliffReplacerTest.php index 01bf25f..74eb80d 100644 --- a/tests/XliffReplacerTest.php +++ b/tests/XliffReplacerTest.php @@ -1210,6 +1210,13 @@ public function should_replace_12_units_with_empty_segments_with_the_correct_sta $this->assertEquals( $status, $output[ 'files' ][ 1 ][ 'trans-units' ][ 3 ][ 'target' ][ 'attr' ][ 'state' ] ); } + /** + * @test + */ + public function should_replace_12_with_mrk_space() { + + } + /** * @test */ @@ -1218,14 +1225,68 @@ public function should_replace_12_units_with_entities() { $data = $this->getData( [ [ 'sid' => '1', - 'segment' => 'Hello'' ', - 'internal_id' => '2973331', + 'segment' => 'No more digging. If it’s top of mind, you’ll find it at the top of your inbox.', + 'internal_id' => '2975931', 'mrk_id' => '0', 'prev_tags' => '', 'succ_tags' => '', 'mrk_prev_tags' => NULL, 'mrk_succ_tags' => NULL, - 'translation' => 'Ciao'' ', + 'translation' => 'This is the translation', + 'status' => 'APPROVED', + 'error' => '', + 'eq_word_count' => '1.34', + 'raw_word_count' => '2.00', + 'source_page' => NULL, + 'r2' => NULL, + 'data_ref_map' => NULL, + ], + [ + 'sid' => '1', + 'segment' => 'No more digging. If it’s top of mind, you’ll find it at the top of your inbox.', + 'internal_id' => '2975931', + 'mrk_id' => '1', + 'prev_tags' => '', + 'succ_tags' => '', + 'mrk_prev_tags' => NULL, + 'mrk_succ_tags' => NULL, + 'translation' => 'This is the second part of the translation', + 'status' => 'APPROVED', + 'error' => '', + 'eq_word_count' => '1.34', + 'raw_word_count' => '2.00', + 'source_page' => NULL, + 'r2' => NULL, + 'data_ref_map' => NULL, + ], + [ + 'sid' => '2', + 'segment' => 'Your messages are all here with a new look and feel. See how your mailbox uses AI to save you time.', + 'internal_id' => '2976344', + 'mrk_id' => '0', + 'prev_tags' => '', + 'succ_tags' => '', + 'mrk_prev_tags' => NULL, + 'mrk_succ_tags' => NULL, + 'translation' => 'This is the translation', + 'status' => 'APPROVED', + 'error' => '', + 'eq_word_count' => '1.34', + 'raw_word_count' => '2.00', + 'source_page' => NULL, + 'r2' => NULL, + 'data_ref_map' => NULL, + ], + [ + 'sid' => '2', + 'segment' => 'Your messages are all here with a new look and feel. See how your mailbox uses AI to save you time.', + 'internal_id' => '2976344', + 'mrk_id' => '1', + 'prev_tags' => '', + 'succ_tags' => '', + 'mrk_prev_tags' => NULL, + 'mrk_succ_tags' => NULL, + 'translation' => 'This is the second part of the translation', 'status' => 'APPROVED', 'error' => '', 'eq_word_count' => '1.34', @@ -1236,13 +1297,13 @@ public function should_replace_12_units_with_entities() { ], ] ); - $inputFile = __DIR__ . '/../tests/files/with-entities.xliff'; - $outputFile = __DIR__ . '/../tests/files/output/with-entities.xliff'; + $inputFile = __DIR__ . '/../tests/files/mrk-with-space.xliff'; + $outputFile = __DIR__ . '/../tests/files/output/mrk-with-space.xliff'; ( new XliffParser() )->replaceTranslation( $inputFile, $data[ 'data' ], $data[ 'transUnits' ], 'it-it', $outputFile, false ); $output = ( new XliffParser() )->xliffToArray( file_get_contents( $outputFile ) ); - $this->assertEquals( "Ciao'' ", $output[ 'files' ][ 1 ][ 'trans-units' ][ 1 ][ 'target' ][ 'raw-content' ] ); + $this->assertEquals( "This is the translation This is the second part of the translation", $output[ 'files' ][ 1 ][ 'trans-units' ][ 1 ][ 'target' ][ 'raw-content' ] ); } /** diff --git a/tests/files/mrk-with-space.xliff b/tests/files/mrk-with-space.xliff new file mode 100644 index 0000000..3ecb6fe --- /dev/null +++ b/tests/files/mrk-with-space.xliff @@ -0,0 +1,39 @@ + + +
+ + 2 + 40 + null + null + null + null + 40 + +
+ + + No more digging. If it’s top of mind, you’ll find it at the top of your inbox. + No more digging. If it’s top of mind, you’ll find it at the top of your inbox. + No more digging. If it’s top of mind, you’ll find it at the top of your inbox. + Reported in https://ouryahoo.atlassian.net/browse/MDF-3156, missing a space before/after a period.Description for onboarding ai features modal priority stepNo more digging. If it’s top of mind, you’ll find it at the top of your inbox. + + + + + + + + Your messages are all here with a new look and feel. See how your mailbox uses AI to save you time. + Your messages are all here with a new look and feel. See how your mailbox uses AI to save you time. + Your messages are all here with a new look and feel. See how your mailbox uses AI to save you time. + Reported in https://ouryahoo.atlassian.net/browse/MDF-3156, missing a space before/after a period.Onboarding AI features card bodyYour messages are all here with a new look and feel. See how your mailbox uses AI to save you time. + + + + + + + +
+
\ No newline at end of file From fb6557f7b76bd8abedfc32eddab9630372578515 Mon Sep 17 00:00:00 2001 From: Mauro Cassani Date: Mon, 25 Nov 2024 17:04:01 +0100 Subject: [PATCH 2/2] Preserve spaces between tags in the translation --- src/XliffReplacer/AbstractXliffReplacer.php | 41 +++++++++++++-------- src/XliffReplacer/Xliff12.php | 4 +- src/XliffReplacer/Xliff20.php | 4 +- 3 files changed, 29 insertions(+), 20 deletions(-) diff --git a/src/XliffReplacer/AbstractXliffReplacer.php b/src/XliffReplacer/AbstractXliffReplacer.php index 6c446a1..a5ba0ff 100644 --- a/src/XliffReplacer/AbstractXliffReplacer.php +++ b/src/XliffReplacer/AbstractXliffReplacer.php @@ -298,33 +298,42 @@ protected function characterData( $parser, string $data ): void { * postprocess escaped data and write to disk * * @param resource $fp - * @param string $data - * @param bool $treatAsCDATA + * @param string $data + * @param bool $treatAsCDATA + * @param bool $parseMarks */ - protected function postProcAndFlush( $fp, string $data, bool $treatAsCDATA = false ) { + protected function postProcAndFlush($fp, string $data, bool $treatAsCDATA = false, $parseMarks = false ) { //postprocess string $data = preg_replace( "/" . self::$INTERNAL_TAG_PLACEHOLDER . '(.*?)' . self::$INTERNAL_TAG_PLACEHOLDER . "/", '&$1;', $data ); $data = str_replace( ' ', ' ', $data ); - // check if there are spaces between tags - preg_match_all('/]*>(.*?)<\/mrk>(\s+)/', $data, $spacesBetweenMrkCheck); - - if(!empty($spacesBetweenMrkCheck[0])){ + // extract map only for tag + if($parseMarks){ + // check if there are spaces between tags + preg_match_all('/]*>(.*?)<\/mrk>(\s+)/', $data, $spacesBetweenMrkCheck); - // $spacesBetweenMrkCheck[0] // holds the complete tags - // $spacesBetweenMrkCheck[1] // holds the text - // $spacesBetweenMrkCheck[2] // holds the spaces + if(!empty($spacesBetweenMrkCheck[0])){ - foreach ($spacesBetweenMrkCheck[0] as $index => $mrk){ - preg_match('/mid="(\d+)"/', $mrk, $markMatch); + // $spacesBetweenMrkCheck[0] // holds the complete tags + // $spacesBetweenMrkCheck[1] // holds the text + // $spacesBetweenMrkCheck[2] // holds the spaces - if(isset($markMatch[1])){ + foreach ($spacesBetweenMrkCheck[0] as $index => $mrk){ - if(!isset($this->mrkTagsMap[$this->currentTransUnitId])){ - $this->mrkTagsMap[$this->currentTransUnitId] = []; + if($this instanceof Xliff20){ + preg_match('/id="(\d+)"/', $mrk, $markMatch); + } else { + preg_match('/mid="(\d+)"/', $mrk, $markMatch); } - $this->mrkTagsMap[$this->currentTransUnitId][$markMatch[1]] = $spacesBetweenMrkCheck[2][$index]; + if(isset($markMatch[1])){ + + if(!isset($this->mrkTagsMap[$this->currentTransUnitId])){ + $this->mrkTagsMap[$this->currentTransUnitId] = []; + } + + $this->mrkTagsMap[$this->currentTransUnitId][$markMatch[1]] = $spacesBetweenMrkCheck[2][$index]; + } } } } diff --git a/src/XliffReplacer/Xliff12.php b/src/XliffReplacer/Xliff12.php index 24970c7..765cc65 100644 --- a/src/XliffReplacer/Xliff12.php +++ b/src/XliffReplacer/Xliff12.php @@ -142,7 +142,7 @@ protected function tagClose( $parser, string $name ) { $this->CDATABuffer = ""; //flush to the pointer - $this->postProcAndFlush( $this->outputFP, $tag ); + $this->postProcAndFlush( $this->outputFP, $tag, false, $name === 'seg-source' ); } elseif ( $name === $this->tuTagName ) { @@ -179,7 +179,7 @@ protected function tagClose( $parser, string $name ) { $this->CDATABuffer = ""; //flush to the pointer - $this->postProcAndFlush( $this->outputFP, $tag ); + $this->postProcAndFlush( $this->outputFP, $tag, false, $name === 'seg-source' ); } else { //ok, nothing to be done; reset flag for next coming tag diff --git a/src/XliffReplacer/Xliff20.php b/src/XliffReplacer/Xliff20.php index aea66ff..665783b 100644 --- a/src/XliffReplacer/Xliff20.php +++ b/src/XliffReplacer/Xliff20.php @@ -233,7 +233,7 @@ protected function tagClose( $parser, string $name ) { $this->CDATABuffer = ""; //flush to the pointer - $this->postProcAndFlush( $this->outputFP, $tag ); + $this->postProcAndFlush( $this->outputFP, $tag, false, $name === 'source' ); } elseif ( 'segment' === $name ) { @@ -278,7 +278,7 @@ protected function tagClose( $parser, string $name ) { $this->CDATABuffer = ""; //flush to the pointer - $this->postProcAndFlush( $this->outputFP, $tag ); + $this->postProcAndFlush( $this->outputFP, $tag, false, $name === 'source' ); } else { //ok, nothing to be done; reset flag for next coming tag