diff --git a/index.js b/index.js index 4614cb3..b7fc780 100755 --- a/index.js +++ b/index.js @@ -12,8 +12,6 @@ import validateNames from 'jsdom/lib/jsdom/living/helpers/validate-names.js'; import nunjucks from 'nunjucks'; import css from 'css'; -import { Readability } from '@mozilla/readability'; -import createDOMPurify from 'dompurify'; import MimeType from 'whatwg-mimetype'; /* @@ -35,28 +33,16 @@ import humanDate from './src/util/human-date.js'; import outputPath from './src/util/output-path.js'; import getCssPageFormat from './src/util/get-css-page-format.js'; import { resolveSequence, resolveParallel } from './src/util/promises.js'; -import { getUrlOrigin } from './src/util/url-origin.js'; import addExif from './src/exif.js'; -import { hyphenateDom } from './src/hyphenate.js'; -import { textToIso6391, getLanguageAttribute } from './src/util/language.js'; -import { setIdsAndReturnHeadings, nestHeadings } from './src/headings.js'; -import { - ampToHtml, - fixLazyLoadedImages, - imagesAtFullSize, - wikipediaSpecific, - noUselessHref, - relativeToAbsoluteURIs, - singleImgToFigure, - expandDetailsElements, - githubSpecific, - wrapPreBlocks -} from './src/enhancements.js'; -import mapRemoteResources from './src/remote-resources.js'; -import inlineImages from './src/inline-images.js'; +import { isURL } from './src/util/url.js'; +import { isFeed, processFeed } from './src/feeds.js'; + import get_style_attribute_value from './src/get-style-attribute-value.js'; +import cleanupItem from './src/cleanup-item.js'; +import wrapHTMLFragment from './src/util/wrap-html-fragment.js'; + const out = process.stdout; const err = process.stderr; @@ -74,24 +60,6 @@ const JUSTIFY_CSS = ` } `; -const enhancePage = function (dom) { - // Note: the order of the enhancements matters! 
- [ - ampToHtml, - fixLazyLoadedImages, - relativeToAbsoluteURIs, - imagesAtFullSize, - singleImgToFigure, - noUselessHref, - expandDetailsElements, - wikipediaSpecific, - githubSpecific, - wrapPreBlocks - ].forEach(enhancement => { - enhancement(dom.window.document); - }); -}; - /* Some setup ---------- @@ -99,7 +67,11 @@ const enhancePage = function (dom) { let configured = false; function configure() { if (!configured) { - nunjucks.configure({ autoescape: false, noCache: true }); + const env = nunjucks.configure({ + autoescape: false, + noCache: true + }); + env.addFilter('humandate', humanDate); configured = true; } } @@ -141,16 +113,6 @@ function launch(options, size) { ----------------------------------- */ -function isURL(ref) { - try { - new URL(ref); - return true; - } catch (err) { - // no-op - } - return false; -} - async function fetchContent(ref, fetchOptions = {}) { if (ref instanceof stream.Readable) { return { @@ -248,152 +210,65 @@ async function cleanup(url, options) { url: final_url }); - // Force relative URL resolution - dom.window.document.body.setAttribute(null, null); - - const sanitizer = createDOMPurify(dom.window); - - const amp = dom.window.document.querySelector('link[rel~=amphtml]'); - if (amp && options.amp) { - err.write('\nFound AMP version (use `--no-amp` to ignore)\n'); - return cleanup(amp.href, options, amp.href); - } + const doc = dom.window.document; - err.write(`Enhancing web page: ${url}`); + // Stop-gap solution as some of these are still referenced in cleanupItem() + const ENV = { + err, + out, + UA + }; - /* - Run enhancements - ---------------- + /* + If the file is a valid RSS/Atom feed + extract the feed entries to be processed further. 
*/ - enhancePage(dom); - - // Run through readability and return - const R = new Readability(dom.window.document, { - classesToPreserve: [ - 'no-href', - - /* - Placed on some elements - as in-page anchors - */ - 'anchor' - ], - /* - Change Readability's serialization to return - a DOM element (instead of a HTML string) - as the `.content` property returned from `.parse()` - - This makes it easier for us to run subsequent - transformations (sanitization, hyphenation, etc.) - without having to parse/serialize the HTML repeatedly. - */ - serializer: el => el - }); - - // TODO: find better solution to prevent Readability from - // making img srcs relative. - if (options.mapRemoteResources || options.inline) { - R._fixRelativeUris = () => {}; - } - - const parsed = R.parse() || {}; - - let remoteResources; - if (options.mapRemoteResources) { - remoteResources = mapRemoteResources(parsed.content); - } - - // Hyphenate the text - const textContent = sanitizer.sanitize(parsed.textContent); - const lang = - getLanguageAttribute(dom.window.document) || - textToIso6391(textContent); - - err.write(' ✓\n'); - if (options.inline) { - await inlineImages( - parsed.content, - { - headers: { - 'user-agent': UA - }, - /* - Send the referrer as the browser would - when fetching the image to render it. - - The referrer policy would take care of - stripping the URL down to its origin, - but just in case, let’s strip it ourselves. 
- */ - referrer: getUrlOrigin(final_url), - referrerPolicy: 'strict-origin-when-cross-origin', - timeout: 10 * 1000 + if (isFeed(doc)) { + const w = options.wait * 1000; + const entries = processFeed(doc); + return resolveSequence( + entries, + async entry => { + const itemDOM = new JSDOM(wrapHTMLFragment(entry), { + url: entry.url + }); + entry.content = itemDOM.window.document.body; + const clean = await cleanupItem( + itemDOM, + options, + entry, + ENV + ); + return { + ...clean, + // TODO + originalContent: { + buffer: null, + contentType: 'text/html' + } + }; }, - options.debug ? out : undefined + Number.isFinite(w) && w >= 0 ? w : 0 ); } /* - Select the appropriate serialization method - based on the bundle target. EPUBs need the - content to be XHTML (produced by a XML serializer), - rather than normal HTML. - */ - const serializer = options.xhtml - ? arr => { - const xs = new dom.window.XMLSerializer(); - return arr.map(el => xs.serializeToString(el)).join(''); - } - : arr => arr.map(el => el.innerHTML).join(''); - - /* - When dompurify returns a DOM node, it always wraps it - in a HTMLBodyElement. We only need its children. - */ - const sanitize_to_dom = dirty => - Array.from( - sanitizer.sanitize(dirty, { RETURN_DOM: true }).children - ); - - const content_els = sanitize_to_dom(parsed.content); - - // `--toc-level` implies `--toc`, unless disabled with `--no-toc`. - let headings = []; - if (options['toc-level'] > 1 && options.toc !== false) { - headings = setIdsAndReturnHeadings( - content_els, - options['toc-level'] - ).map(heading => { - return { - id: heading.id, - level: heading.level, - // Plain text used in EPUB - text: heading.node.textContent.trim(), - // Sanitized marked-up text used in HTML/PDF - content: serializer([heading.node]) - }; - }); + Use the AMP version of the article + if one is available and the user has not opted out. 
+ */ + const amp = doc.querySelector('link[rel~=amphtml]'); + if (amp && options.amp) { + err.write('\nFound AMP version (use `--no-amp` to ignore)\n'); + return cleanup(amp.href, options, amp.href); } return { - id: `percollate-page-${uuid()}`, - url: final_url, - title: sanitizer.sanitize(parsed.title), - byline: sanitizer.sanitize(parsed.byline), - dir: sanitizer.sanitize(parsed.dir), - excerpt: serializer(sanitize_to_dom(parsed.excerpt)), - content: serializer( - options.hyphenate === true - ? content_els.map(el => hyphenateDom(el, lang)) - : content_els - ), - lang, - textContent, - toc: nestHeadings(headings || []), - length: parsed.length, - siteName: sanitizer.sanitize(parsed.siteName), - remoteResources, + ...(await cleanupItem(dom, options, null, ENV)), + /* + Augument for original content, useful when + percollate is used as an API. + */ originalContent: { buffer, contentType @@ -438,7 +313,7 @@ async function bundlePdf(items, options) { filetype: 'pdf', title, author, - date: humanDate(new Date()), + date: new Date(), items, style, options: { @@ -636,7 +511,7 @@ async function bundleHtml(items, options) { title: options.title || (items.length === 1 ? items[0].title : 'Untitled'), - date: humanDate(new Date()), + date: new Date(), items, style, options: { @@ -691,7 +566,7 @@ async function bundleMd(items, options) { title: options.title || (items.length === 1 ? 
items[0].title : 'Untitled'), - date: humanDate(new Date()), + date: new Date(), items, style, options: { @@ -762,7 +637,9 @@ async function generate(fn, urls, options = {}) { }, w ) - ).filter(it => it); + ) + .filter(it => it) + .flat(); if (options.individual) { await Promise.all(items.map(item => fn([item], options))); @@ -995,6 +872,5 @@ async function epubgen(data, output_path, options) { export { configure, pdf, epub, html, md }; export const __test__ = { - fetchContent, - isURL + fetchContent }; diff --git a/src/cleanup-item.js b/src/cleanup-item.js new file mode 100644 index 0000000..8f7168e --- /dev/null +++ b/src/cleanup-item.js @@ -0,0 +1,208 @@ +import { randomUUID as uuid } from 'node:crypto'; + +import { Readability } from '@mozilla/readability'; +import createDOMPurify from 'dompurify'; + +import { hyphenateDom } from './hyphenate.js'; +import { textToIso6391, getLanguageAttribute } from './util/language.js'; +import { getURLOrigin } from './util/url.js'; +import { setIdsAndReturnHeadings, nestHeadings } from './headings.js'; + +import { + ampToHtml, + fixLazyLoadedImages, + imagesAtFullSize, + wikipediaSpecific, + noUselessHref, + relativeToAbsoluteURIs, + singleImgToFigure, + expandDetailsElements, + githubSpecific, + wrapPreBlocks +} from './enhancements.js'; +import mapRemoteResources from './remote-resources.js'; +import inlineImages from './inline-images.js'; + +/* + + The `parsedContent` can be populated, e.g. from feed entries. 
+ The following structure is required: + + { + content: JSDOM instance, + title: + byline: + dir: + excerpt: + length: + siteName: + } + +*/ +export default async function cleanupItem( + dom, + options, + parsedContent = null, + env +) { + const doc = dom.window.document; + const url = dom.window.location.href; + + const sanitizer = createDOMPurify(dom.window); + + // Force relative URL resolution + doc.body.setAttribute(null, null); + + /* + Run DOM enhancements + */ + env.err.write(`Enhancing web page: ${url}`); + enhancePage(doc); + + let parsed; + + if (parsedContent) { + parsed = parsedContent; + } else { + /* + Run through Readability + */ + const R = new Readability(doc, { + classesToPreserve: [ + 'no-href', + // Placed on some elements as in-page anchors + 'anchor' + ], + /* + Change Readability's serialization to return + a DOM element (instead of a HTML string) + as the `.content` property returned from `.parse()` + + This makes it easier for us to run subsequent + transformations (sanitization, hyphenation, etc.) + without having to parse/serialize the HTML repeatedly. + */ + serializer: el => el + }); + + // TODO: find better solution to prevent Readability from + // making img srcs relative. + if (options.mapRemoteResources || options.inline) { + R._fixRelativeUris = () => {}; + } + + parsed = R.parse() || {}; + } + + const remoteResources = options.mapRemoteResources + ? mapRemoteResources(parsed.content) + : null; + + const textContent = sanitizer.sanitize( + parsed.textContent || parsed.content.textContent + ); + const lang = getLanguageAttribute(doc) || textToIso6391(textContent); + + env.err.write(' ✓\n'); + + if (options.inline) { + await inlineImages( + parsed.content, + { + headers: { + 'user-agent': env.UA + }, + /* + Send the referrer as the browser would + when fetching the image to render it. + + The referrer policy would take care of + stripping the URL down to its origin, + but just in case, let’s strip it ourselves. 
+ */ + referrer: getURLOrigin(url), + referrerPolicy: 'strict-origin-when-cross-origin', + timeout: 10 * 1000 + }, + options.debug ? env.out : undefined + ); + } + + /* + Select the appropriate serialization method + based on the bundle target. EPUBs need the + content to be XHTML (produced by a XML serializer), + rather than normal HTML. + */ + const serializer = options.xhtml + ? arr => { + const xs = new dom.window.XMLSerializer(); + return arr.map(el => xs.serializeToString(el)).join(''); + } + : arr => arr.map(el => el.innerHTML).join(''); + + /* + When dompurify returns a DOM node, it always wraps it + in a HTMLBodyElement. We only need its children. + */ + const sanitize_to_dom = dirty => + Array.from(sanitizer.sanitize(dirty, { RETURN_DOM: true }).children); + + const content_els = sanitize_to_dom(parsed.content); + + // `--toc-level` implies `--toc`, unless disabled with `--no-toc`. + let headings = []; + if (options['toc-level'] > 1 && options.toc !== false) { + headings = setIdsAndReturnHeadings( + content_els, + options['toc-level'] + ).map(heading => { + return { + id: heading.id, + level: heading.level, + // Plain text used in EPUB + text: heading.node.textContent.trim(), + // Sanitized marked-up text used in HTML/PDF + content: serializer([heading.node]) + }; + }); + } + + return { + id: `percollate-page-${uuid()}`, + url, + title: sanitizer.sanitize(parsed.title), + byline: sanitizer.sanitize(parsed.byline), + published: sanitizer.sanitize(parsed.published || parsed.publishedTime), + updated: sanitizer.sanitize(parsed.updated), + dir: sanitizer.sanitize(parsed.dir), + excerpt: serializer(sanitize_to_dom(parsed.excerpt)), + content: serializer( + options.hyphenate === true + ? 
content_els.map(el => hyphenateDom(el, lang)) + : content_els + ), + lang, + textContent, + toc: nestHeadings(headings || []), + length: parsed.length, + siteName: sanitizer.sanitize(parsed.siteName), + remoteResources + }; +} + +function enhancePage(doc) { + // Note: the order of the enhancements matters! + [ + ampToHtml, + fixLazyLoadedImages, + relativeToAbsoluteURIs, + imagesAtFullSize, + singleImgToFigure, + noUselessHref, + expandDetailsElements, + wikipediaSpecific, + githubSpecific, + wrapPreBlocks + ].forEach(fn => fn(doc)); +} diff --git a/src/feeds.js b/src/feeds.js new file mode 100644 index 0000000..b436687 --- /dev/null +++ b/src/feeds.js @@ -0,0 +1,101 @@ +import { isURL } from './util/url.js'; + +/* + Specification: + + RFC 4287: The Atom Syndication Format + https://datatracker.ietf.org/doc/html/rfc4287 +*/ +function isAtomFeed(doc) { + return ( + doc.documentElement.localName === 'feed' && + doc.documentElement.namespaceURI === 'http://www.w3.org/2005/Atom' + ); +} + +/* + Specification: + + RSS 2.0 Specification + https://www.rssboard.org/rss-specification +*/ +function isRssFeed(doc) { + return doc.documentElement.localName === 'rss'; +} + +function processAtomFeed(doc) { + const feedLink = doc + .querySelector('feed > link:not([rel]), feed > link[rel=alternate]') + ?.getAttribute('href'); + const feedAuthor = doc.querySelector('feed > author name')?.textContent; + const entries = Array.from(doc.querySelectorAll('feed > entry')).map( + entry => { + const ret = { + title: entry.querySelector('title')?.textContent ?? '', + published: entry.querySelector('published')?.textContent, + updated: entry.querySelector('updated')?.textContent, + byline: + entry.querySelector('author name')?.textContent ?? + feedAuthor, + url: entry + .querySelector('link:not([rel]), link[rel=alternate]') + ?.getAttribute('href'), + content: entry.querySelector('content')?.textContent ?? 
''
+			};
+			if (ret.url && isURL(feedLink)) {
+				// Resolve relative entry URL, TODO: also use xml:base
+				ret.url = new URL(ret.url, feedLink).href;
+			}
+			if (ret.updated && !ret.published) {
+				ret.published = ret.updated;
+			}
+			return ret;
+		}
+	);
+	return entries;
+}
+
+function processRssFeed(doc) {
+	const feedLink = doc.querySelector(
+		'channel > link:not([rel]), channel > link[rel=alternate]'
+	)?.textContent;
+	const feedAuthor = doc.querySelector('channel > author')?.textContent;
+	const entries = Array.from(doc.querySelectorAll('channel > item')).map(
+		entry => {
+			const ret = {
+				title: entry.querySelector('title')?.textContent ?? '',
+				published: entry.querySelector('pubDate')?.textContent,
+				updated: entry.querySelector('atom\\:updated')?.textContent,
+				byline:
+					entry.querySelector('author')?.textContent ?? feedAuthor,
+				url:
+					entry.querySelector('link:not([rel]), link[rel=alternate]')
+						?.textContent ?? '',
+				content: entry.querySelector('description')?.textContent ?? ''
+			};
+			if (ret.url && isURL(feedLink)) {
+				// Resolve relative entry URL, TODO: also use xml:base
+				ret.url = new URL(ret.url, feedLink).href;
+			}
+			if (ret.updated && !ret.published) {
+				ret.published = ret.updated;
+			}
+			return ret;
+		}
+	);
+	return entries;
+}
+
+export function isFeed(doc) {
+	return isAtomFeed(doc) || isRssFeed(doc);
+}
+
+export function processFeed(doc) {
+	if (isAtomFeed(doc)) {
+		return processAtomFeed(doc);
+	}
+	if (isRssFeed(doc)) {
+		return processRssFeed(doc);
+	}
+	return null;
+}
diff --git a/src/remote-resources.js b/src/remote-resources.js
index e1860a1..504eb05 100644
--- a/src/remote-resources.js
+++ b/src/remote-resources.js
@@ -5,7 +5,7 @@ import {
 	extForMimetype,
 	isImageURL
 } from './util/file-mimetype.js';
-import { getUrlOrigin } from './util/url-origin.js';
+import { getURLOrigin } from './util/url.js';
 
 export default function remoteResources(doc) {
 	let srcs = new Map();
@@ -31,7 +31,7 @@ export default function remoteResources(doc) {
 			srcs.set(src, {
 				original: src,
 				mapped: `rr-${uuid()}${ext}`,
-				origin: getUrlOrigin(doc.baseURI),
+				origin: getURLOrigin(doc.baseURI),
 				mimetype: mime
 			});
 		}
diff --git a/src/util/human-date.js b/src/util/human-date.js
index 2b57485..cc0fe62 100644
--- a/src/util/human-date.js
+++ b/src/util/human-date.js
@@ -1,6 +1,13 @@
-export default function humanDate(d) {
-	const pad = num => (num < 10 ? '0' + num : num);
-	return `${pad(d.getUTCFullYear())}-${pad(d.getUTCMonth() + 1)}-${pad(
-		d.getUTCDate()
-	)}`;
+const dateTimeFormatter = new Intl.DateTimeFormat('en', {
+	dateStyle: 'medium',
+	timeStyle: 'short'
+});
+
+const dateFormatter = new Intl.DateTimeFormat('en', {
+	dateStyle: 'medium'
+});
+
+export default function humanDate(date, includeTime = false) {
+	const d = new Date(date);
+	return Number.isNaN(d.getTime()) ? '' : (includeTime ? dateTimeFormatter : dateFormatter).format(d);
 }
diff --git a/src/util/output-path.js b/src/util/output-path.js
index ec7b651..ef15e78 100644
--- a/src/util/output-path.js
+++ b/src/util/output-path.js
@@ -27,7 +27,7 @@ export default function (items, options = {}, ext, cache = {}) {
 	if (options.output) {
 		return options.output;
 	}
-	if (items.length > 1) {
+	if (!items.length || items.length > 1) {
 		return `percollate-${Date.now()}${ext || ''}`;
 	}
 	return slugifyTitle(items[0].title, cache) + (ext || '');
diff --git a/src/util/url-origin.js b/src/util/url-origin.js
deleted file mode 100644
index e2799c2..0000000
--- a/src/util/url-origin.js
+++ /dev/null
@@ -1,9 +0,0 @@
-export function getUrlOrigin(str) {
-	let origin;
-	try {
-		origin = new URL(str).origin;
-	} catch (err) {
-		// ignore
-	}
-	return origin && origin !== 'null' ?
origin : undefined; -} diff --git a/src/util/url.js b/src/util/url.js new file mode 100644 index 0000000..995476d --- /dev/null +++ b/src/util/url.js @@ -0,0 +1,20 @@ +// Polyfill for URL.canParse(), which was added in later Node.js versions +export function isURL(ref) { + try { + new URL(ref); + return true; + } catch (err) { + // no-op + } + return false; +} + +export function getURLOrigin(str) { + let origin; + try { + origin = new URL(str).origin; + } catch (err) { + // ignore + } + return origin && origin !== 'null' ? origin : undefined; +} diff --git a/src/util/wrap-html-fragment.js b/src/util/wrap-html-fragment.js new file mode 100644 index 0000000..d3eb895 --- /dev/null +++ b/src/util/wrap-html-fragment.js @@ -0,0 +1,18 @@ +/* + A minimal HTML shell for a DOM fragment, + such as the content of a XML feed entry, + to pass to the JSDOM constructor. +*/ +export default function wrapHTMLFragment(item) { + return ` + + + + ${item.title || ''} + + +
${item.content || ''}
+ + + `; +} diff --git a/templates/default.css b/templates/default.css index 1f5c14e..fbdc655 100644 --- a/templates/default.css +++ b/templates/default.css @@ -117,6 +117,7 @@ article { } article:not(:last-of-type) { + margin-bottom: 2em; page-break-after: always; } @@ -136,9 +137,12 @@ article:not(:last-of-type) { line-height: 1.1; } +.article__byline, +.article__time, .article__url { font-style: italic; font-size: 0.9em; + margin: 0; } /* @@ -148,6 +152,7 @@ article:not(:last-of-type) { .article__content img { max-width: 100%; + height: auto; display: block; margin: 0 auto; } diff --git a/templates/default.html b/templates/default.html index daefd0a..1a2d5d2 100644 --- a/templates/default.html +++ b/templates/default.html @@ -31,7 +31,7 @@

{{ title }}

{{ date | humandate }}

@@ -57,6 +57,18 @@

Table of Contents

{{ item.title }}

{% if item.byline %}

By {{ item.byline }}

+ {% endif %} {% if item.published %} +

+ Published + {% if item.updated and item.updated !== item.published %} + · Updated + + {% endif %} +

{% endif %}

Source: diff --git a/test/url-origin.test.js b/test/url-origin.test.js deleted file mode 100644 index 63388a1..0000000 --- a/test/url-origin.test.js +++ /dev/null @@ -1,9 +0,0 @@ -import tape from 'tape'; -import { getUrlOrigin } from '../src/util/url-origin.js'; - -tape('getUrlOrigin', t => { - t.equal(getUrlOrigin('invalid'), undefined); - t.equal(getUrlOrigin('file:///Users/myuser/'), undefined); - t.equal(getUrlOrigin('https://github.com/user/repo'), 'https://github.com'); - t.end(); -}); diff --git a/test/url.test.js b/test/url.test.js new file mode 100644 index 0000000..029a2f7 --- /dev/null +++ b/test/url.test.js @@ -0,0 +1,16 @@ +import tape from 'tape'; +import { getURLOrigin, isURL } from '../src/util/url.js'; + +tape('getURLOrigin', t => { + t.equal(getURLOrigin('invalid'), undefined); + t.equal(getURLOrigin('file:///Users/myuser/'), undefined); + t.equal(getURLOrigin('https://github.com/user/repo'), 'https://github.com'); + t.end(); +}); + +tape('isURL', t => { + t.equal(isURL('invalid'), false); + t.equal(isURL('file:///Users/myuser/'), true); + t.equal(isURL('https://github.com/user/repo'), true); + t.end(); +});