diff --git a/src/BloomExe/Publish/BloomPub/BloomPubMaker.cs b/src/BloomExe/Publish/BloomPub/BloomPubMaker.cs
index edca1623a03e..b8deb49d9bf5 100644
--- a/src/BloomExe/Publish/BloomPub/BloomPubMaker.cs
+++ b/src/BloomExe/Publish/BloomPub/BloomPubMaker.cs
@@ -170,6 +170,9 @@ public static string CreateBloomPub(
                     .Cast(),
                 modifiedBook.FolderPath
             );
+            // BL-15848 Deduplication to save space. After publish-time media transforms so we compare the actual bloompub payload,
+            // not source files that may still be resized, trimmed, or rewritten later in the pipeline.
+            PublishHelper.DeDuplicateMediaFiles(modifiedBook.RawDom, modifiedBook.FolderPath);
             var newContent = XmlHtmlConverter.ConvertDomToHtml5(modifiedBook.RawDom);
 
             var originalBookHtmlPath = BookStorage.FindBookHtmlInFolder(modifiedBook.FolderPath);
diff --git a/src/BloomExe/Publish/PublishHelper.cs b/src/BloomExe/Publish/PublishHelper.cs
index e376e9b57ae5..0e04818ed20b 100644
--- a/src/BloomExe/Publish/PublishHelper.cs
+++ b/src/BloomExe/Publish/PublishHelper.cs
@@ -4,6 +4,7 @@
 using System.Drawing;
 using System.IO;
 using System.Linq;
+using System.Security.Cryptography;
 using System.Text;
 using System.Threading;
 using System.Threading.Tasks;
@@ -1084,6 +1085,379 @@ public static Book.Book MakeDeviceXmatterTempBook(
             return modifiedBook;
         }
 
+        /// <summary>
+        /// One spot in the DOM that points at a media file, together with the way to repoint it.
+        /// </summary>
+        private sealed class MediaReference
+        {
+            /// <summary>Book-folder-relative, URL-decoded path of the referenced file.</summary>
+            public string RelativePath;
+
+            /// <summary>Rewrites this reference in the DOM to point at the given relative path.</summary>
+            public Action<string> RewriteReference;
+        }
+
+        /// <summary>
+        /// Collapses byte-identical media files referenced by <paramref name="dom"/> into a single
+        /// copy: references to a duplicate are rewritten to the first-encountered file with the
+        /// same content, and the now-unreferenced duplicate files are deleted from
+        /// <paramref name="folderPath"/>. Images, videos, and non-narration audio are handled as
+        /// three independent groups.
+        /// </summary>
+        internal static void DeDuplicateMediaFiles(SafeXmlDocument dom, string folderPath)
+        {
+            DeDuplicateReferencedMedia(GetImageMediaReferences(dom), folderPath);
+            DeDuplicateReferencedMedia(GetVideoMediaReferences(dom), folderPath);
+            // Narration files are tied to specific spans, so keep those one-to-one file names stable.
+            var talkingBookAudioFileNames = GetTalkingBookAudioFileNames(dom);
+            DeDuplicateReferencedMedia(
+                GetNonTalkingAudioMediaReferences(dom, talkingBookAudioFileNames),
+                folderPath
+            );
+        }
+
+        /// <summary>
+        /// Core of the deduplication: groups <paramref name="references"/> by the file they name,
+        /// hashes the files that could have a twin, repoints every reference to a duplicate at the
+        /// first file seen with that content, and finally deletes the duplicates.
+        /// </summary>
+        /// <param name="references">References to consider; enumerated exactly once.</param>
+        /// <param name="folderPath">Folder that the references' relative paths are resolved against.</param>
+        private static void DeDuplicateReferencedMedia(
+            IEnumerable<MediaReference> references,
+            string folderPath
+        )
+        {
+            var referencesByPath = new Dictionary<string, List<MediaReference>>();
+            var firstRelativePathByNormalizedPath = new Dictionary<string, string>();
+            var fullPathByNormalizedPath = new Dictionary<string, string>();
+            var normalizedPathsInEncounterOrder = new List<string>();
+
+            foreach (var reference in references)
+            {
+                if (string.IsNullOrWhiteSpace(reference.RelativePath))
+                    continue;
+
+                var relativePath = NormalizeSlashes(reference.RelativePath);
+                var caseNormalizedRelativePath = BookStorage.GetNormalizedPathForOS(relativePath); // for comparison
+                if (!referencesByPath.TryGetValue(caseNormalizedRelativePath, out var refsForPath))
+                {
+                    refsForPath = new List<MediaReference>();
+                    referencesByPath[caseNormalizedRelativePath] = refsForPath;
+                    firstRelativePathByNormalizedPath[caseNormalizedRelativePath] = relativePath;
+                    fullPathByNormalizedPath[caseNormalizedRelativePath] = ResolveMediaFilePath(
+                        folderPath,
+                        relativePath
+                    );
+                    normalizedPathsInEncounterOrder.Add(caseNormalizedRelativePath);
+                }
+
+                refsForPath.Add(reference);
+            }
+
+            // Hashing is the expensive step (videos can be large), and files of different lengths
+            // can never be identical, so only hash files that share their length with another
+            // referenced file. GroupBy keeps encounter order within each group, so the canonical
+            // copy is still the first one referenced.
+            var hashCandidates = normalizedPathsInEncounterOrder
+                .Where(normalizedPath => RobustFile.Exists(fullPathByNormalizedPath[normalizedPath]))
+                .GroupBy(normalizedPath => new FileInfo(fullPathByNormalizedPath[normalizedPath]).Length)
+                .Where(sameLengthPaths => sameLengthPaths.Count() > 1)
+                .SelectMany(sameLengthPaths => sameLengthPaths);
+
+            var hashToCanonicalRelativePath = new Dictionary<string, string>();
+            var canonicalFullPaths = new HashSet<string>();
+            var fullPathsToDelete = new HashSet<string>();
+            foreach (var normalizedRelativePath in hashCandidates)
+            {
+                var filePath = fullPathByNormalizedPath[normalizedRelativePath];
+                var hashString = ComputeFileHash(filePath);
+                if (
+                    hashToCanonicalRelativePath.TryGetValue(
+                        hashString,
+                        out var canonicalRelativePath
+                    )
+                )
+                {
+                    // Rewrite all references before deleting any duplicate file so later references to
+                    // the same duplicate path can still be resolved during this pass.
+                    foreach (var reference in referencesByPath[normalizedRelativePath])
+                    {
+                        reference.RewriteReference(canonicalRelativePath);
+                    }
+
+                    fullPathsToDelete.Add(filePath);
+                }
+                else
+                {
+                    hashToCanonicalRelativePath[hashString] = firstRelativePathByNormalizedPath[
+                        normalizedRelativePath
+                    ];
+                    canonicalFullPaths.Add(filePath);
+                }
+            }
+
+            // Two differently spelled references (e.g. "a.png" and "./a.png") can resolve to the
+            // same file. Such an alias hashes equal to its own canonical copy, so without this
+            // exclusion we would delete the only copy of the file.
+            fullPathsToDelete.ExceptWith(canonicalFullPaths);
+            foreach (var fullPath in fullPathsToDelete)
+            {
+                RobustFile.Delete(fullPath);
+            }
+        }
+
+        /// <summary>
+        /// Yields a reference for every image (img or background-image element) in the document
+        /// and for the coverImage / licenseImage entries of the bloomDataDiv, skipping empty
+        /// and placeholder images.
+        /// </summary>
+        private static IEnumerable<MediaReference> GetImageMediaReferences(SafeXmlDocument dom)
+        {
+            foreach (
+                var imageElement in HtmlDom
+                    .SelectChildImgAndBackgroundImageElements(dom.DocumentElement)
+                    .Cast<SafeXmlElement>()
+            )
+            {
+                var relativePath = HtmlDom.GetImageElementUrl(imageElement).PathOnly.NotEncoded;
+                if (
+                    string.IsNullOrWhiteSpace(relativePath)
+                    || ImageUtils.IsPlaceholderImageFilename(relativePath)
+                )
+                {
+                    continue;
+                }
+
+                yield return new MediaReference
+                {
+                    RelativePath = relativePath,
+                    RewriteReference = canonicalRelativePath =>
+                        HtmlDom.SetImageElementUrl(
+                            imageElement,
+                            UrlPathString.CreateFromUnencodedString(canonicalRelativePath)
+                        ),
+                };
+            }
+
+            foreach (
+                var bookSetting in dom.SafeSelectNodes(
+                        "//div[@id='bloomDataDiv']/div[@data-book='coverImage' or @data-book='licenseImage']"
+                    )
+                    .Cast<SafeXmlElement>()
+            )
+            {
+                var relativePath = UrlPathString
+                    .CreateFromUrlEncodedString(bookSetting.InnerText.Trim())
+                    .PathOnly.NotEncoded;
+                if (
+                    string.IsNullOrWhiteSpace(relativePath)
+                    || ImageUtils.IsPlaceholderImageFilename(relativePath)
+                )
+                {
+                    continue;
+                }
+
+                yield return new MediaReference
+                {
+                    RelativePath = relativePath,
+                    RewriteReference = canonicalRelativePath =>
+                    {
+                        // NOTE(review): the value is read as URL-encoded but written back unencoded;
+                        // confirm that is what readers of these data-div entries expect.
+                        bookSetting.InnerText = canonicalRelativePath;
+                        if (bookSetting.GetAttribute("data-book") == "coverImage")
+                        {
+                            bookSetting.SetAttribute("src", canonicalRelativePath);
+                        }
+                    },
+                };
+            }
+        }
+
+        /// <summary>
+        /// Yields a reference for every video element in the document that has a non-empty URL.
+        /// </summary>
+        private static IEnumerable<MediaReference> GetVideoMediaReferences(SafeXmlDocument dom)
+        {
+            foreach (
+                var videoContainer in HtmlDom
+                    .SelectChildVideoElements(dom.DocumentElement)
+                    .Cast<SafeXmlElement>()
+            )
+            {
+                var relativePath = HtmlDom.GetVideoElementUrl(videoContainer).PathOnly.NotEncoded;
+                if (string.IsNullOrWhiteSpace(relativePath))
+                    continue;
+
+                yield return new MediaReference
+                {
+                    RelativePath = relativePath,
+                    RewriteReference = canonicalRelativePath =>
+                        HtmlDom.SetVideoElementUrl(
+                            videoContainer,
+                            UrlPathString.CreateFromUnencodedString(canonicalRelativePath)
+                        ),
+                };
+            }
+        }
+
+        /// <summary>
+        /// Yields a reference for every audio file named by a background-music attribute or by a
+        /// data-sound / data-correct-sound / data-wrong-sound attribute, leaving out the narration
+        /// (talking book) recordings listed in <paramref name="talkingBookAudioFileNames"/>.
+        /// </summary>
+        private static IEnumerable<MediaReference> GetNonTalkingAudioMediaReferences(
+            SafeXmlDocument dom,
+            HashSet<string> talkingBookAudioFileNames
+        )
+        {
+            foreach (
+                var pageWithBackgroundMusic in HtmlDom
+                    .SelectChildBackgroundMusicElements(dom.DocumentElement)
+                    .Cast<SafeXmlElement>()
+            )
+            {
+                var reference = MakeAudioAttributeReference(
+                    pageWithBackgroundMusic,
+                    HtmlDom.musicAttrName,
+                    talkingBookAudioFileNames
+                );
+                if (reference != null)
+                    yield return reference;
+            }
+
+            // The three sound attributes get identical treatment; they are visited in this order.
+            var soundAttributeNames = new[] { "data-sound", "data-correct-sound", "data-wrong-sound" };
+            foreach (var attributeName in soundAttributeNames)
+            {
+                foreach (
+                    var soundElement in dom.SafeSelectNodes($".//*[@{attributeName}]")
+                        .Cast<SafeXmlElement>()
+                )
+                {
+                    var reference = MakeAudioAttributeReference(
+                        soundElement,
+                        attributeName,
+                        talkingBookAudioFileNames
+                    );
+                    if (reference != null)
+                        yield return reference;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Builds a reference for the audio file named by <paramref name="attributeName"/> on
+        /// <paramref name="element"/>, located under the book's "audio" folder.
+        /// </summary>
+        /// <returns>
+        /// The reference, or null when the attribute is blank, is "none", or names a narration file.
+        /// </returns>
+        private static MediaReference MakeAudioAttributeReference(
+            SafeXmlElement element,
+            string attributeName,
+            HashSet<string> talkingBookAudioFileNames
+        )
+        {
+            var rawValue = element.GetAttribute(attributeName);
+            if (string.IsNullOrWhiteSpace(rawValue) || rawValue == "none")
+                return null;
+
+            var fileName = UrlPathString.CreateFromUrlEncodedString(rawValue).PathOnly.NotEncoded;
+            var normalizedFileName = BookStorage.GetNormalizedPathForOS(fileName);
+            if (talkingBookAudioFileNames.Contains(normalizedFileName))
+                return null;
+
+            return new MediaReference
+            {
+                RelativePath = MakeRelativePath("audio", fileName),
+                // The attribute holds a bare file name, so strip the "audio/" prefix again.
+                RewriteReference = canonicalRelativePath =>
+                    element.SetAttribute(attributeName, Path.GetFileName(canonicalRelativePath)),
+            };
+        }
+
+        /// <summary>
+        /// Collects the normalized file names of the narration recordings that belong to the
+        /// narration elements of the document.
+        /// </summary>
+        private static HashSet<string> GetTalkingBookAudioFileNames(SafeXmlDocument dom)
+        {
+            // Match Bloom's narration selector so we skip exactly the files publish already treats as
+            // talking-book audio, including split TextBox recordings.
+            var fileNames = new HashSet<string>();
+            foreach (
+                var narrationElement in HtmlDom
+                    .SelectChildNarrationAudioElements(
+                        dom.DocumentElement,
+                        includeSplitTextBoxAudio: true,
+                        langsToExclude: null
+                    )
+                    .Cast<SafeXmlElement>()
+            )
+            {
+                var narrationId = narrationElement.GetOptionalStringAttribute("id", null);
+                if (string.IsNullOrWhiteSpace(narrationId))
+                    continue;
+
+                foreach (var fileName in BookStorage.GetNarrationAudioFileNames(narrationId, true))
+                {
+                    fileNames.Add(BookStorage.GetNormalizedPathForOS(fileName));
+                }
+            }
+
+            return fileNames;
+        }
+
+        /// <summary>Returns the SHA-256 digest of the file's content as a hexadecimal string.</summary>
+        private static string ComputeFileHash(string filePath)
+        {
+            using (var stream = RobustFile.OpenRead(filePath))
+            {
+                return Convert.ToHexString(SHA256.HashData(stream));
+            }
+        }
+
+        /// <summary>
+        /// Turns a book-relative media path into a normalized full path. "." and ".." segments are
+        /// collapsed so that different spellings of the same file yield the same string.
+        /// </summary>
+        private static string ResolveMediaFilePath(string folderPath, string relativePath)
+        {
+            var path = relativePath.Replace('/', Path.DirectorySeparatorChar);
+            var combinedPath = Path.Combine(folderPath, path);
+            try
+            {
+                combinedPath = Path.GetFullPath(combinedPath);
+            }
+            catch (Exception e)
+                when (e is ArgumentException || e is NotSupportedException || e is PathTooLongException)
+            {
+                // Not a usable file system path; keep it as is and let the existence check reject it.
+            }
+
+            return BookStorage.GetNormalizedPathForOS(combinedPath);
+        }
+
+        /// <summary>Converts backslashes to forward slashes.</summary>
+        private static string NormalizeSlashes(string relativePath)
+        {
+            return relativePath.Replace('\\', '/');
+        }
+
+        /// <summary>
+        /// Joins the non-blank parts with "/" after trimming leading and trailing slashes from each.
+        /// </summary>
+        private static string MakeRelativePath(params string[] parts)
+        {
+            return string.Join(
+                "/",
+                parts
+                    .Where(part => !string.IsNullOrWhiteSpace(part))
+                    .Select(part => part.Trim('/', '\\'))
+            );
+        }
+
         ///
         /// Once we have really cropped any images that need it, we no longer need the "canvas element-like"
         /// HTML structure that supports cropping background images. So we get rid of the extra structure
diff --git a/src/BloomTests/Publish/BloomPub/BloomPubMakerTests.cs b/src/BloomTests/Publish/BloomPub/BloomPubMakerTests.cs
index 9e9c61e17a7c..47816a9d3865 100644
--- a/src/BloomTests/Publish/BloomPub/BloomPubMakerTests.cs
+++ b/src/BloomTests/Publish/BloomPub/BloomPubMakerTests.cs
@@ -1228,6 +1228,265 @@ public void CompressBookForDevice_NotBloomEnterprise_DoesNotRemoveVideoPages()
             );
         }
 
+        [Test]
+        public void CompressBookForDevice_DeduplicatesDuplicateImages()
+        {
+            const string bodyContent =
+                @"
+
+
+ first + second +
+
+"; + + TestHtmlAfterCompression( + bodyContent, + bookHeadContent: kMinimumValidBookHeadContent, + actionsOnFolderBeforeCompressing: folderPath => + { + File.Copy( + FileLocationUtilities.GetFileDistributedWithApplication( + _pathToTestImages, + "shirt.png" + ), + Path.Combine(folderPath, "duplicate-a.png") + ); + File.Copy( + FileLocationUtilities.GetFileDistributedWithApplication( + _pathToTestImages, + "shirt.png" + ), + Path.Combine(folderPath, "duplicate-b.png") + ); + }, + assertionsOnZipArchive: paramObj => + { + var zip = paramObj.ZipFile; + Assert.AreNotEqual(-1, zip.FindEntry("duplicate-a.png", false)); + Assert.AreEqual(-1, zip.FindEntry("duplicate-b.png", false)); + }, + assertionsOnResultingHtmlString: html => + { + var htmlDom = XmlHtmlConverter.GetXmlDomFromHtml(html); + AssertThatXmlIn + .Dom(htmlDom) + .HasSpecifiedNumberOfMatchesForXpath("//img[@src='duplicate-a.png']", 2); + AssertThatXmlIn + .Dom(htmlDom) + .HasNoMatchForXpath("//img[@src='duplicate-b.png']"); + } + ); + } + + [Test] + public void DeDuplicateMediaFiles_UpdatesCoverImageDataDivSrcAttribute() + { + var dom = SafeXmlDocument.Create(); + dom.LoadXml( + @" + + +
+
duplicate-b.png
+
+
+
+ first +
+
+ +" + ); + + using (var folder = new TemporaryFolder("DeDuplicateMediaFiles_CoverImageSrc")) + { + File.Copy( + FileLocationUtilities.GetFileDistributedWithApplication( + _pathToTestImages, + "shirt.png" + ), + Path.Combine(folder.Path, "duplicate-a.png") + ); + File.Copy( + FileLocationUtilities.GetFileDistributedWithApplication( + _pathToTestImages, + "shirt.png" + ), + Path.Combine(folder.Path, "duplicate-b.png") + ); + + PublishHelper.DeDuplicateMediaFiles(dom, folder.Path); + + AssertThatXmlIn + .Dom(dom) + .HasSpecifiedNumberOfMatchesForXpath( + "//div[@id='bloomDataDiv']/div[@data-book='coverImage' and @src='duplicate-a.png' and text()='duplicate-a.png']", + 1 + ); + Assert.That( + RobustFile.Exists(Path.Combine(folder.Path, "duplicate-a.png")), + Is.True + ); + Assert.That( + RobustFile.Exists(Path.Combine(folder.Path, "duplicate-b.png")), + Is.False + ); + } + } + + [Test] + public void CompressBookForDevice_DeduplicatesDuplicateVideos() + { + const string bodyContent = + @" +
+
+
+
+
+
+"; + + TestHtmlAfterCompression( + bodyContent, + bookHeadContent: kMinimumValidBookHeadContent, + actionsOnFolderBeforeCompressing: folderPath => + { + var videoFolderPath = Path.Combine(folderPath, "video"); + Directory.CreateDirectory(videoFolderPath); + RobustFile.Copy( + FileLocationUtilities.GetFileDistributedWithApplication( + kPathToTestVideos, + "Five count.mp4" + ), + Path.Combine(videoFolderPath, "DuplicateA.mp4") + ); + RobustFile.Copy( + FileLocationUtilities.GetFileDistributedWithApplication( + kPathToTestVideos, + "Five count.mp4" + ), + Path.Combine(videoFolderPath, "DuplicateB.mp4") + ); + }, + assertionsOnZipArchive: paramObj => + { + var zip = paramObj.ZipFile; + Assert.AreNotEqual(-1, zip.FindEntry("video/DuplicateA.mp4", false)); + Assert.AreEqual(-1, zip.FindEntry("video/DuplicateB.mp4", false)); + }, + assertionsOnResultingHtmlString: html => + { + var htmlDom = XmlHtmlConverter.GetXmlDomFromHtml(html); + AssertThatXmlIn + .Dom(htmlDom) + .HasSpecifiedNumberOfMatchesForXpath( + "//source[@src='video/DuplicateA.mp4']", + 2 + ); + AssertThatXmlIn + .Dom(htmlDom) + .HasNoMatchForXpath("//source[@src='video/DuplicateB.mp4']"); + } + ); + } + + [Test] + public void CompressBookForDevice_DeduplicatesDuplicateNonTalkingAudio() + { + const string bodyContent = + @" +
+
one
+
+
+
two
+
+"; + + var testBook = CreateBookWithPhysicalFile( + bodyContent, + kMinimumValidBookHeadContent, + bringBookUpToDate: true + ); + testBook.CollectionSettings.Subscription = Subscription.CreateTempSubscriptionForTier( + SubscriptionTier.Pro + ); + BookStorageTests.MakeSampleAudioFiles(testBook.FolderPath, "duplicate-a", ".mp3"); + BookStorageTests.MakeSampleAudioFiles(testBook.FolderPath, "duplicate-b", ".mp3"); + + PublishHelper.DeDuplicateMediaFiles(testBook.RawDom, testBook.FolderPath); + + AssertThatXmlIn + .Dom(testBook.RawDom) + .HasSpecifiedNumberOfMatchesForXpath( + "//div[@data-backgroundaudio='duplicate-a.mp3']", + 2 + ); + AssertThatXmlIn + .Dom(testBook.RawDom) + .HasNoMatchForXpath("//div[@data-backgroundaudio='duplicate-b.mp3']"); + Assert.That( + RobustFile.Exists(Path.Combine(testBook.FolderPath, "audio", "duplicate-a.mp3")), + Is.True + ); + Assert.That( + RobustFile.Exists(Path.Combine(testBook.FolderPath, "audio", "duplicate-b.mp3")), + Is.False + ); + } + + [Test] + public void CompressBookForDevice_DoesNotDeduplicateTalkingBookAudio() + { + const string bodyContent = + @" +
+
+
+
+

one

+

two

+
+
+
+
+"; + + var testBook = CreateBookWithPhysicalFile( + bodyContent, + kMinimumValidBookHeadContent, + bringBookUpToDate: true + ); + BookStorageTests.MakeSampleAudioFiles(testBook.FolderPath, "audio-id-1", ".mp3"); + BookStorageTests.MakeSampleAudioFiles(testBook.FolderPath, "audio-id-2", ".mp3"); + + PublishHelper.DeDuplicateMediaFiles(testBook.RawDom, testBook.FolderPath); + + AssertThatXmlIn + .Dom(testBook.RawDom) + .HasSpecifiedNumberOfMatchesForXpath( + "//span[@id='audio-id-1' and contains(@class,'audio-sentence')]", + 1 + ); + AssertThatXmlIn + .Dom(testBook.RawDom) + .HasSpecifiedNumberOfMatchesForXpath( + "//span[@id='audio-id-2' and contains(@class,'audio-sentence')]", + 1 + ); + Assert.That( + RobustFile.Exists(Path.Combine(testBook.FolderPath, "audio", "audio-id-1.mp3")), + Is.True + ); + Assert.That( + RobustFile.Exists(Path.Combine(testBook.FolderPath, "audio", "audio-id-2.mp3")), + Is.True + ); + } + private Stream GetEntryContentsStream(ZipFile zip, string name, bool exact = false) { Func predicate;