From 347f9bd1879dc9688a801a099e09d64d9b4d69fe Mon Sep 17 00:00:00 2001
From: maclei
Date: Sun, 24 May 2026 23:41:25 +0900
Subject: [PATCH] EPUB exporter: resolve TOC fragments and write embedded font assets
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two narrowly-scoped improvements to `EpubExporter`:

## TOC fragment resolution

AZW3 and MOBI importers leave TOC entries with bare chapter hrefs
(`partNNNN.html` or `content.html`) at open time — the `#fileposN` /
`#aid-XXXX` fragment is populated only when `Book::resolve_toc` is
called. Previously, the EPUB exporter generated the NCX without calling
it first, so every TOC entry within a single chapter collapsed onto the
same `partNNNN.html` href and readers landed on chapter starts instead
of the intended in-chapter target.

Both `export_raw` and `export_normalized` now call `book.resolve_toc()`
before generating the NCX. EPUB importers don't need this (their TOC is
already resolved by the importer), so the call is a no-op for that
backend.

## Font writing in normalized export

`export_normalized` writes assets from `NormalizedContent::assets`,
which only contains resources referenced from the IR DOM. Embedded
fonts are typically referenced from CSS `@font-face` rules rather than
DOM nodes, so they never made it into the normalized asset list and the
exported EPUB shipped `@font-face` declarations whose `src:` URLs
pointed at files that were never written into the ZIP.

The exporter now snapshots `book.list_assets()` before normalization
and, after writing the normalized assets, additionally enumerates every
`fonts/*` path that wasn't already covered — adding both an OPF
manifest entry and a ZIP entry for each. `export_raw` is unaffected
here because it already writes the full `book.list_assets()` set
verbatim.

## Tests

`tests/epub_exports_fonts_and_toc.rs`:

- `epub_export_writes_font_assets_from_kfx` uses the existing
  `tests/fixtures/fonts_only.kfx.gz` fixture (the one added by #13) to
  confirm that the three KFX font assets the importer surfaces are
  written into the exported EPUB's `OEBPS/fonts/` directory and
  referenced in the OPF manifest.
- `epub_export_resolves_azw3_toc_fragments` uses the existing
  `tests/fixtures/epictetus.azw3` fixture to confirm that the generated
  `toc.ncx` contains resolved `#aid-XXXX` fragments rather than the
  bare chapter hrefs the importer initially produced.

Both tests fail without this patch's changes; both pass with them.

## Related work

PR #13 surfaced KFX fonts; this PR makes them survive the EPUB export.
For the AZW3 / MOBI side, font extraction is being added in a separate
PR — the font-writing logic here is format-agnostic and benefits any
importer that surfaces `fonts/*` paths.
---
 src/export/epub.rs                  |  59 ++++++++++
 tests/epub_exports_fonts_and_toc.rs | 169 ++++++++++++++++++++++++++++
 2 files changed, 228 insertions(+)
 create mode 100644 tests/epub_exports_fonts_and_toc.rs

diff --git a/src/export/epub.rs b/src/export/epub.rs
index fd7eef2..6ea08c6 100644
--- a/src/export/epub.rs
+++ b/src/export/epub.rs
@@ -80,6 +80,14 @@ impl Exporter for EpubExporter {
 impl EpubExporter {
     /// Export with passthrough mode (preserves original HTML/CSS).
     fn export_raw(&self, book: &mut Book, writer: &mut W) -> io::Result<()> {
+        // Resolve TOC fragments before we generate the NCX. AZW3 and MOBI
+        // importers leave TOC entries with bare chapter hrefs until
+        // `resolve_toc()` populates the `#fileposN` / `#id` suffix from the
+        // book's NCX / position map. Without this call the exported NCX has
+        // unresolvable anchors and readers land on chapter starts instead of
+        // the intended in-chapter targets.
+ book.resolve_toc(); + let mut zip = ZipWriter::new(writer); let compression_level = self.config.compression_level.unwrap_or(6); @@ -184,6 +192,18 @@ impl EpubExporter { ) -> io::Result<()> { use super::normalize::normalize_book; + // Resolve TOC fragments before generating the NCX. Same rationale as + // `export_raw`: AZW3 / MOBI importers leave TOC entries with bare + // chapter hrefs until this is called. + book.resolve_toc(); + + // Snapshot every asset the importer surfaces. The normalized content + // pipeline only records assets it actively references, which misses + // resources like embedded fonts that are referenced from CSS rather + // than from the IR DOM. We add those back into the manifest and ZIP + // below. + let all_assets: Vec<_> = book.list_assets().to_vec(); + // Normalize the book content let content = normalize_book(book)?; @@ -243,6 +263,28 @@ impl EpubExporter { }); } + // Add font assets the importer surfaced that aren't already covered + // by normalized content. Fonts are typically referenced from CSS, not + // from DOM nodes, so `normalize_book` doesn't pull them into + // `content.assets`. Without this we'd emit `@font-face` rules whose + // `src:` URLs point at files we never wrote into the ZIP. + let mut extra_font_idx = 0; + for asset_path in &all_assets { + let path_str = asset_path.to_string_lossy(); + if !path_str.starts_with("fonts/") { + continue; + } + if content.assets.iter().any(|a| a == &*path_str) { + continue; + } + manifest_items.push(ManifestItem { + id: format!("font_{}", extra_font_idx), + href: format!("OEBPS/{}", sanitize_path(&path_str)), + media_type: guess_media_type(&path_str), + }); + extra_font_idx += 1; + } + // 4. Write content.opf let opf = generate_opf(book.metadata(), &manifest_items, &spine_refs); zip.start_file("OEBPS/content.opf", deflated) @@ -280,6 +322,23 @@ impl EpubExporter { } } + // 9. Write font assets not already covered by normalized content. + // Matches the manifest entries added above. 
+ for asset_path in &all_assets { + let path_str = asset_path.to_string_lossy(); + if !path_str.starts_with("fonts/") { + continue; + } + if content.assets.iter().any(|a| a == &*path_str) { + continue; + } + let zip_path = format!("OEBPS/{}", sanitize_path(&path_str)); + if let Ok(data) = book.load_asset(asset_path) { + zip.start_file(&zip_path, deflated).map_err(io_error)?; + zip.write_all(&data)?; + } + } + zip.finish().map_err(io_error)?; Ok(()) } diff --git a/tests/epub_exports_fonts_and_toc.rs b/tests/epub_exports_fonts_and_toc.rs new file mode 100644 index 0000000..84f3cb2 --- /dev/null +++ b/tests/epub_exports_fonts_and_toc.rs @@ -0,0 +1,169 @@ +//! Regression tests for the EPUB exporter's font writing and TOC resolution. +//! +//! Covers two distinct improvements to the EPUB exporter: +//! +//! - Embedded fonts surfaced by the importer (`fonts/font_NNNN.*` asset paths) +//! are now written into the exported EPUB even when the normalized-content +//! pipeline did not pull them into its asset list. Without this, books with +//! custom typography exported a `@font-face` stylesheet whose `src:` URLs +//! pointed at files the exporter never wrote into the ZIP. +//! - `book.resolve_toc()` is now called by the exporter, so TOC entries that +//! importers leave with bare chapter hrefs (AZW3 / MOBI) get their fragment +//! suffix populated before the NCX is generated. + +use std::io::{Cursor, Read}; +use std::path::Path; + +use boko::Book; +use boko::export::{EpubExporter, Exporter}; +use flate2::read::GzDecoder; +use zip::ZipArchive; + +/// Decompress the `fonts_only.kfx.gz` fixture to a temp file. +/// +/// The fixture is a stripped KFX containing only `bcRawFont` and `Font` +/// entities — exactly the shape that exercises the importer's font discovery +/// pipeline added in #13. 
+fn decompress_fonts_only_fixture() -> Option<tempfile::NamedTempFile> {
+    let gz_path = "tests/fixtures/fonts_only.kfx.gz";
+    if !Path::new(gz_path).exists() {
+        eprintln!("Skipping test - fixture not found: {gz_path}");
+        return None;
+    }
+
+    let gz_data = std::fs::read(gz_path).expect("read fixture");
+    let mut decoder = GzDecoder::new(&gz_data[..]);
+    let mut kfx_data = Vec::new();
+    decoder.read_to_end(&mut kfx_data).expect("decompress");
+
+    let mut tmp = tempfile::Builder::new()
+        .suffix(".kfx")
+        .tempfile()
+        .expect("temp file");
+    std::io::Write::write_all(&mut tmp, &kfx_data).expect("write temp");
+    Some(tmp)
+}
+
+#[test]
+fn epub_export_writes_font_assets_from_kfx() {
+    let Some(tmp) = decompress_fonts_only_fixture() else {
+        return;
+    };
+
+    let mut book = Book::open(tmp.path()).expect("open KFX fonts_only fixture");
+
+    // Sanity: the importer surfaces the fonts via list_assets.
+    let surfaced_fonts: Vec<_> = book
+        .list_assets()
+        .iter()
+        .filter(|p| p.to_string_lossy().starts_with("fonts/"))
+        .cloned()
+        .collect();
+    assert_eq!(
+        surfaced_fonts.len(),
+        3,
+        "importer should expose 3 font assets, got {surfaced_fonts:?}",
+    );
+
+    // Export to an in-memory EPUB.
+    let mut buf = Cursor::new(Vec::<u8>::new());
+    EpubExporter::new()
+        .export(&mut book, &mut buf)
+        .expect("epub export");
+
+    let epub_bytes = buf.into_inner();
+    let mut zip = ZipArchive::new(Cursor::new(epub_bytes)).expect("open epub zip");
+
+    // The exporter must write every font asset the importer surfaced.
+    for asset in &surfaced_fonts {
+        let zip_path = format!("OEBPS/{}", asset.to_string_lossy());
+        let mut entry = zip
+            .by_name(&zip_path)
+            .unwrap_or_else(|_| panic!("missing {zip_path} in exported EPUB"));
+        let mut data = Vec::new();
+        entry.read_to_end(&mut data).expect("read font entry");
+        assert!(
+            data.len() > 1000,
+            "{zip_path} truncated: only {} bytes",
+            data.len()
+        );
+    }
+
+    // The OPF manifest must reference each font asset as well. The OPF emits
+    // hrefs relative to its own location (OEBPS/), so we look for the bare
+    // `fonts/font_NNNN.*` path, not the `OEBPS/...` ZIP path.
+    let mut opf = String::new();
+    zip.by_name("OEBPS/content.opf")
+        .expect("OEBPS/content.opf present")
+        .read_to_string(&mut opf)
+        .expect("read opf");
+    for asset in &surfaced_fonts {
+        let needle = asset.to_string_lossy().to_string();
+        assert!(
+            opf.contains(&needle),
+            "content.opf manifest missing href \"{needle}\""
+        );
+    }
+}
+
+#[test]
+fn epub_export_resolves_azw3_toc_fragments() {
+    let azw3_path = "tests/fixtures/epictetus.azw3";
+    if !Path::new(azw3_path).exists() {
+        eprintln!("Skipping test - fixture not found: {azw3_path}");
+        return;
+    }
+
+    // Sanity check: the AZW3 importer leaves TOC entries with bare chapter
+    // hrefs at open time — every Enchiridion section initially points at
+    // part0002.html with no fragment.
+    {
+        let book = Book::open(azw3_path).expect("open epictetus.azw3");
+        let mut enchiridion_section_hrefs = Vec::new();
+        for entry in book.toc() {
+            if entry.title == "The Enchiridion" {
+                for child in &entry.children {
+                    enchiridion_section_hrefs.push(child.href.clone());
+                }
+                break;
+            }
+        }
+        assert!(
+            !enchiridion_section_hrefs.is_empty(),
+            "expected nested Enchiridion sections"
+        );
+        for href in &enchiridion_section_hrefs {
+            assert!(
+                !href.contains('#'),
+                "expected bare chapter href before resolve_toc, got {href}"
+            );
+        }
+    }
+
+    // Export to in-memory EPUB. The exporter must call `resolve_toc()`
+    // internally so the generated NCX has fragments populated.
+    let mut book = Book::open(azw3_path).expect("re-open epictetus.azw3");
+    let mut buf = Cursor::new(Vec::<u8>::new());
+    EpubExporter::new()
+        .export(&mut book, &mut buf)
+        .expect("epub export");
+
+    let epub_bytes = buf.into_inner();
+    let mut zip = ZipArchive::new(Cursor::new(epub_bytes)).expect("open epub zip");
+
+    let mut ncx = String::new();
+    zip.by_name("OEBPS/toc.ncx")
+        .expect("OEBPS/toc.ncx present")
+        .read_to_string(&mut ncx)
+        .expect("read ncx");
+
+    // Resolved entries get an `#aid-XXXX` fragment populated from the NCX
+    // index. The pre-fix exporter produced bare `partNNNN.html` hrefs with
+    // no fragment, so any non-trivial count here proves resolve_toc ran.
+    let aid_refs = ncx.matches("#aid-").count();
+    assert!(
+        aid_refs > 10,
+        "expected resolved #aid- fragments in NCX, got only {aid_refs}; \
+         resolve_toc likely didn't run during export. NCX:\n{ncx}",
+    );
+}