elements as in-page anchors
+ 'anchor'
+ ],
+ /*
+ Change Readability's serialization to return
+ a DOM element (instead of a HTML string)
+ as the `.content` property returned from `.parse()`
+
+ This makes it easier for us to run subsequent
+ transformations (sanitization, hyphenation, etc.)
+ without having to parse/serialize the HTML repeatedly.
+ */
+ serializer: el => el
+ });
+
+ // TODO: find better solution to prevent Readability from
+ // making img srcs relative.
+ if (options.mapRemoteResources || options.inline) {
+ R._fixRelativeUris = () => {};
+ }
+
+ parsed = R.parse() || {};
+ }
+
+ const remoteResources = options.mapRemoteResources
+ ? mapRemoteResources(parsed.content)
+ : null;
+
+ const textContent = sanitizer.sanitize(
+ parsed.textContent || parsed.content.textContent
+ );
+ const lang = getLanguageAttribute(doc) || textToIso6391(textContent);
+
+ env.err.write(' ✓\n');
+
+ if (options.inline) {
+ await inlineImages(
+ parsed.content,
+ {
+ headers: {
+ 'user-agent': env.UA
+ },
+ /*
+ Send the referrer as the browser would
+ when fetching the image to render it.
+
+ The referrer policy would take care of
+ stripping the URL down to its origin,
+ but just in case, let’s strip it ourselves.
+ */
+ referrer: getURLOrigin(url),
+ referrerPolicy: 'strict-origin-when-cross-origin',
+ timeout: 10 * 1000
+ },
+ options.debug ? env.out : undefined
+ );
+ }
+
+ /*
+ Select the appropriate serialization method
+ based on the bundle target. EPUBs need the
+ content to be XHTML (produced by a XML serializer),
+ rather than normal HTML.
+ */
+ const serializer = options.xhtml
+ ? arr => {
+ const xs = new dom.window.XMLSerializer();
+ return arr.map(el => xs.serializeToString(el)).join('');
+ }
+ : arr => arr.map(el => el.innerHTML).join('');
+
+ /*
+ When dompurify returns a DOM node, it always wraps it
+ in a HTMLBodyElement. We only need its children.
+ */
+ const sanitize_to_dom = dirty =>
+ Array.from(sanitizer.sanitize(dirty, { RETURN_DOM: true }).children);
+
+ const content_els = sanitize_to_dom(parsed.content);
+
+ // `--toc-level` implies `--toc`, unless disabled with `--no-toc`.
+ let headings = [];
+ if (options['toc-level'] > 1 && options.toc !== false) {
+ headings = setIdsAndReturnHeadings(
+ content_els,
+ options['toc-level']
+ ).map(heading => {
+ return {
+ id: heading.id,
+ level: heading.level,
+ // Plain text used in EPUB
+ text: heading.node.textContent.trim(),
+ // Sanitized marked-up text used in HTML/PDF
+ content: serializer([heading.node])
+ };
+ });
+ }
+
+ return {
+ id: `percollate-page-${uuid()}`,
+ url,
+ title: sanitizer.sanitize(parsed.title),
+ byline: sanitizer.sanitize(parsed.byline),
+ published: sanitizer.sanitize(parsed.published || parsed.publishedTime),
+ updated: sanitizer.sanitize(parsed.updated),
+ dir: sanitizer.sanitize(parsed.dir),
+ excerpt: serializer(sanitize_to_dom(parsed.excerpt)),
+ content: serializer(
+ options.hyphenate === true
+ ? content_els.map(el => hyphenateDom(el, lang))
+ : content_els
+ ),
+ lang,
+ textContent,
+ toc: nestHeadings(headings || []),
+ length: parsed.length,
+ siteName: sanitizer.sanitize(parsed.siteName),
+ remoteResources
+ };
+}
+
+function enhancePage(doc) {
+ // Calls each enhancement below with `doc`, one after another. Note: the order of the enhancements matters!
+ [
+ ampToHtml,
+ fixLazyLoadedImages,
+ relativeToAbsoluteURIs,
+ imagesAtFullSize,
+ singleImgToFigure,
+ noUselessHref,
+ expandDetailsElements,
+ wikipediaSpecific,
+ githubSpecific,
+ wrapPreBlocks
+ ].forEach(fn => fn(doc));
+}
diff --git a/src/feeds.js b/src/feeds.js
new file mode 100644
index 0000000..b436687
--- /dev/null
+++ b/src/feeds.js
@@ -0,0 +1,101 @@
+import { isURL } from './util/url.js';
+
+/*
+ True when the root element of `doc` is `feed` in the Atom namespace.
+
+ Specification: RFC 4287: The Atom Syndication Format
+ https://datatracker.ietf.org/doc/html/rfc4287
+*/
+function isAtomFeed(doc) {
+ return (
+ doc.documentElement.localName === 'feed' &&
+ doc.documentElement.namespaceURI === 'http://www.w3.org/2005/Atom'
+ );
+}
+
+/*
+ True when the root element of `doc` is named `rss`; its namespace is not checked.
+
+ Specification: RSS 2.0 Specification
+ https://www.rssboard.org/rss-specification
+*/
+function isRssFeed(doc) {
+ return doc.documentElement.localName === 'rss';
+}
+
+function processAtomFeed(doc) {
+ const feedLink = doc
+ .querySelector('feed > link:not([rel]), feed > link[rel=alternate]')
+ ?.getAttribute('href');
+ const feedAuthor = doc.querySelector('feed > author name')?.textContent;
+ const entries = Array.from(doc.querySelectorAll('feed > entry')).map(
+ entry => {
+ const ret = {
+ title: entry.querySelector('title')?.textContent ?? '',
+ published: entry.querySelector('published')?.textContent,
+ updated: entry.querySelector('updated')?.textContent,
+ byline:
+ entry.querySelector('author name')?.textContent ??
+ feedAuthor,
+ url: entry
+ .querySelector('link:not([rel]), link[rel=alternate]')
+ ?.getAttribute('href'),
+ content: entry.querySelector('content')?.textContent ?? ''
+ };
+ if (isURL(ret.link)) {
+ // Resolve relative entry link, TODO: also use xml:base
+ ret.link = new URL(ret.link, feedLink).href;
+ }
+ if (ret.updated && !ret.published) {
+ ret.published = ret.updated;
+ }
+ return ret;
+ }
+ );
+ return entries;
+}
+
+function processRssFeed(doc) {
+ const feedLink = doc.querySelector(
+ 'channel > link:not([rel]), channel > link[rel=alternate]'
+ )?.textContent;
+ const feedAuthor = doc.querySelector('channel > author')?.textContent;
+ const entries = Array.from(doc.querySelectorAll('channel > item')).map(
+ entry => {
+ const ret = {
+ title: entry.querySelector('title')?.textContent ?? '',
+ published: entry.querySelector('pubDate')?.textContent,
+ updated: entry.querySelector('atom\\:updated')?.textContent,
+ byline:
+ entry.querySelector('author')?.textContent ?? feedAuthor,
+ url:
+ entry.querySelector('link:not([rel]), link[rel=alternate]')
+ ?.textContent ?? '',
+ content: entry.querySelector('description')?.textContent ?? ''
+ };
+ if (isURL(ret.link)) {
+ // Resolve relative entry link, TODO: also use xml:base
+ ret.link = new URL(ret.link, feedLink).href;
+ }
+ if (ret.updated && !ret.published) {
+ ret.published = ret.updated;
+ }
+ return ret;
+ }
+ );
+ return entries;
+}
+// True when `doc` is recognized as either an Atom feed or an RSS feed.
+export function isFeed(doc) {
+ return isAtomFeed(doc) || isRssFeed(doc);
+}
+// Returns the entries of the feed document `doc`; the Atom check is made before the RSS one.
+export function processFeed(doc) {
+ if (isAtomFeed(doc)) {
+ return processAtomFeed(doc);
+ }
+ if (isRssFeed(doc)) {
+ return processRssFeed(doc);
+ }
+ return null; // `doc` is neither an Atom nor an RSS feed
+}
diff --git a/src/remote-resources.js b/src/remote-resources.js
index e1860a1..504eb05 100644
--- a/src/remote-resources.js
+++ b/src/remote-resources.js
@@ -5,7 +5,7 @@ import {
extForMimetype,
isImageURL
} from './util/file-mimetype.js';
-import { getUrlOrigin } from './util/url-origin.js';
+import { getURLOrigin } from './util/url.js';
export default function remoteResources(doc) {
let srcs = new Map();
@@ -31,7 +31,7 @@ export default function remoteResources(doc) {
srcs.set(src, {
original: src,
mapped: `rr-${uuid()}${ext}`,
- origin: getUrlOrigin(doc.baseURI),
+ origin: getURLOrigin(doc.baseURI),
mimetype: mime
});
}
diff --git a/src/util/human-date.js b/src/util/human-date.js
index 2b57485..cc0fe62 100644
--- a/src/util/human-date.js
+++ b/src/util/human-date.js
@@ -1,6 +1,13 @@
-export default function humanDate(d) {
- const pad = num => (num < 10 ? '0' + num : num);
- return `${pad(d.getUTCFullYear())}-${pad(d.getUTCMonth() + 1)}-${pad(
- d.getUTCDate()
- )}`;
+const dateTimeFormatter = new Intl.DateTimeFormat('en', {
+ dateStyle: 'medium',
+ timeStyle: 'short'
+});
+// Date-only counterpart of `dateTimeFormatter`; both are created once, at module load.
+const dateFormatter = new Intl.DateTimeFormat('en', {
+ dateStyle: 'medium'
+});
+// Formats `date` (any value `new Date()` accepts) with the 'en' locale, adding the time when `includeTime` is true.
+export default function humanDate(date, includeTime = false) {
+ const d = new Date(date); // NOTE(review): if this yields an Invalid Date, format() below throws a RangeError
+ return includeTime ? dateTimeFormatter.format(d) : dateFormatter.format(d);
}
diff --git a/src/util/output-path.js b/src/util/output-path.js
index ec7b651..ef15e78 100644
--- a/src/util/output-path.js
+++ b/src/util/output-path.js
@@ -27,7 +27,7 @@ export default function (items, options = {}, ext, cache = {}) {
if (options.output) {
return options.output;
}
- if (items.length > 1) {
+ if (!items.length || items.length > 1) {
return `percollate-${Date.now()}${ext || ''}`;
}
return slugifyTitle(items[0].title, cache) + (ext || '');
diff --git a/src/util/url-origin.js b/src/util/url-origin.js
deleted file mode 100644
index e2799c2..0000000
--- a/src/util/url-origin.js
+++ /dev/null
@@ -1,9 +0,0 @@
-export function getUrlOrigin(str) {
- let origin;
- try {
- origin = new URL(str).origin;
- } catch (err) {
- // ignore
- }
- return origin && origin !== 'null' ? origin : undefined;
-}
diff --git a/src/util/url.js b/src/util/url.js
new file mode 100644
index 0000000..995476d
--- /dev/null
+++ b/src/util/url.js
@@ -0,0 +1,20 @@
+// Stand-in for URL.canParse(ref), which was only added in later Node.js versions
+export function isURL(ref) {
+ try {
+ new URL(ref); // no base URL is passed, so a relative reference throws here
+ return true;
+ } catch (err) {
+ // `ref` could not be parsed as an absolute URL: fall through and report false
+ }
+ return false;
+}
+// Returns the origin of the URL `str`, or undefined when it cannot be parsed or has no usable origin.
+export function getURLOrigin(str) {
+ let origin;
+ try {
+ origin = new URL(str).origin;
+ } catch (err) {
+ // `str` is not a parseable URL: `origin` stays undefined
+ }
+ return origin && origin !== 'null' ? origin : undefined; // an opaque origin (e.g. file: URLs) serializes as the string 'null'
+}
diff --git a/src/util/wrap-html-fragment.js b/src/util/wrap-html-fragment.js
new file mode 100644
index 0000000..d3eb895
--- /dev/null
+++ b/src/util/wrap-html-fragment.js
@@ -0,0 +1,18 @@
+/*
+ A minimal HTML shell for a DOM fragment, such as the content of
+ an XML feed entry, to pass to the JSDOM constructor. `item.title` and
+ `item.content` are interpolated as is; a missing one becomes ''.
+*/
+export default function wrapHTMLFragment(item) {
+ return `
+
+
+
+ ${item.title || ''}
+
+
+ ${item.content || ''}
+
+
+ `;
+}
diff --git a/templates/default.css b/templates/default.css
index 1f5c14e..fbdc655 100644
--- a/templates/default.css
+++ b/templates/default.css
@@ -117,6 +117,7 @@ article {
}
article:not(:last-of-type) {
+ margin-bottom: 2em;
page-break-after: always;
}
@@ -136,9 +137,12 @@ article:not(:last-of-type) {
line-height: 1.1;
}
+.article__byline,
+.article__time,
.article__url {
font-style: italic;
font-size: 0.9em;
+ margin: 0;
}
/*
@@ -148,6 +152,7 @@ article:not(:last-of-type) {
.article__content img {
max-width: 100%;
+ height: auto;
display: block;
margin: 0 auto;
}
diff --git a/templates/default.html b/templates/default.html
index daefd0a..1a2d5d2 100644
--- a/templates/default.html
+++ b/templates/default.html
@@ -31,7 +31,7 @@
{{ title }}
{{ date | humandate }}
@@ -57,6 +57,18 @@ Table of Contents
{{ item.title }}
{% if item.byline %}
By {{ item.byline }}
+ {% endif %} {% if item.published %}
+
+ Published
+ {% if item.updated and item.updated !== item.published %}
+ · Updated
+
+ {% endif %}
+
{% endif %}
Source:
diff --git a/test/url-origin.test.js b/test/url-origin.test.js
deleted file mode 100644
index 63388a1..0000000
--- a/test/url-origin.test.js
+++ /dev/null
@@ -1,9 +0,0 @@
-import tape from 'tape';
-import { getUrlOrigin } from '../src/util/url-origin.js';
-
-tape('getUrlOrigin', t => {
- t.equal(getUrlOrigin('invalid'), undefined);
- t.equal(getUrlOrigin('file:///Users/myuser/'), undefined);
- t.equal(getUrlOrigin('https://github.com/user/repo'), 'https://github.com');
- t.end();
-});
diff --git a/test/url.test.js b/test/url.test.js
new file mode 100644
index 0000000..029a2f7
--- /dev/null
+++ b/test/url.test.js
@@ -0,0 +1,16 @@
+import tape from 'tape';
+import { getURLOrigin, isURL } from '../src/util/url.js';
+// Unparseable input and URLs without a usable origin (file:) give undefined; http(s) URLs give their origin.
+tape('getURLOrigin', t => {
+ t.equal(getURLOrigin('invalid'), undefined);
+ t.equal(getURLOrigin('file:///Users/myuser/'), undefined);
+ t.equal(getURLOrigin('https://github.com/user/repo'), 'https://github.com');
+ t.end();
+});
+// A bare word is not an absolute URL; file: and https: URLs are accepted.
+tape('isURL', t => {
+ t.equal(isURL('invalid'), false);
+ t.equal(isURL('file:///Users/myuser/'), true);
+ t.equal(isURL('https://github.com/user/repo'), true);
+ t.end();
+});