From af0b9217ed36381ccd35c32428c1189790c42b23 Mon Sep 17 00:00:00 2001 From: Dong Nguyen Date: Sun, 3 May 2026 14:10:04 +0700 Subject: [PATCH 1/4] v7.2.0 - Add AI Agents guides - Fix some minor issues - Update packages - Correct type definitions - Update CI config --- .aiignore | 15 +++ .github/workflows/ci-test.yml | 12 +-- .github/workflows/codeql-analysis.yml | 2 +- .gitignore | 8 +- .npmignore | 3 + AGENTS.md | 117 ++++++++++++++++++++++ README.md | 16 ++- index.d.ts | 139 ++++++++++++++++++-------- package.json | 23 ++--- src/deno/cross-fetch.js | 2 - src/main.js | 50 ++++++++- src/utils/linker.js | 20 ++++ src/utils/normalizer.js | 75 ++++++++++++++ src/utils/parseAtomFeed.js | 35 ++++++- src/utils/parseJsonFeed.js | 28 +++++- src/utils/parseRdfFeed.js | 32 ++++++ src/utils/parseRssFeed.js | 36 ++++++- src/utils/retrieve.js | 64 +++++++++--- src/utils/xmlparser.js | 33 ++++++ 19 files changed, 619 insertions(+), 91 deletions(-) create mode 100644 .aiignore create mode 100644 AGENTS.md delete mode 100644 src/deno/cross-fetch.js diff --git a/.aiignore b/.aiignore new file mode 100644 index 0000000..c880c25 --- /dev/null +++ b/.aiignore @@ -0,0 +1,15 @@ +node_modules +coverage +coverage.lcov + +package-lock.json +pnpm-lock.yaml +bun.lock + +.env + +dist +storage + +# AI Session Files (Private Context) +.sessions diff --git a/.github/workflows/ci-test.yml b/.github/workflows/ci-test.yml index 7ea3fd9..29c8f67 100755 --- a/.github/workflows/ci-test.yml +++ b/.github/workflows/ci-test.yml @@ -12,27 +12,25 @@ jobs: strategy: matrix: - node_version: [20.x, 22.x, 24.x] + node_version: [22.x, 24.x, 25.x] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: setup Node.js v${{ matrix.node_version }} - uses: actions/setup-node@v4 + uses: actions/setup-node@v6 with: node-version: ${{ matrix.node_version }} - name: run npm scripts - env: - PROXY_SERVER: ${{ secrets.PROXY_SERVER }} run: | npm install npm run lint - #npm run build --if-present + npm run 
build --if-present npm run test - name: cache node modules - uses: actions/cache@v4 + uses: actions/cache@v5 with: path: ~/.npm key: ${{ runner.os }}-node-${{ hashFiles('**/package-lock.json') }} diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index a77d776..5547051 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -38,7 +38,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v4 + uses: actions/checkout@v6 # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL diff --git a/.gitignore b/.gitignore index 8b34578..d417ce1 100755 --- a/.gitignore +++ b/.gitignore @@ -12,12 +12,14 @@ coverage .nyc_output yarn.lock -coverage.lcov -package-lock.json pnpm-lock.yaml +package-lock.json +deno.lock +bun.lock output.json -deno.lock bundle.cjs bundle.cjs.map + +.sessions diff --git a/.npmignore b/.npmignore index 2dfcba1..6292f33 100644 --- a/.npmignore +++ b/.npmignore @@ -4,3 +4,6 @@ coverage pnpm-lock.yaml examples test-data + +.aiignore +.sessions diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..d3bba62 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,117 @@ +# AI Agent Instructions + +Coding guidelines for AI agents working in this project. + +## Philosophy + +- Minimalism. Simple is better. KISS (Keep It Simple, Stupid). +- Clean code, easy to read, easy to delete. +- Functional Programming — pure functions, immutability, no side effects. +- MVP mindset — deliver the smallest thing that works, then iterate. 
+ +## Security Rules (CRITICAL — no exceptions) + +- NEVER output or request .env and example.env file contents +- NEVER hardcode API credentials, secret tokens, private keys or passwords in source code +- NEVER send sensitive data to external AI services +- Follow `.aiignore` and `.gitignore` for excluded files — do not read or reference them +- When asking for help, sanitize data (replace real IDs, emails, tokens with placeholders) +- Do not log sensitive information + +## Coding Standards (Strict) +- Language: JavaScript (ESM syntax). No TypeScript. +- Style: No semicolons, single quotes, 2-space indentation. +- Respect `eslint.config.js` — do not suggest rule changes +- Patterns: + - Functional Programming only. No Classes or OOP. + - Arrow functions are preferred. + - Maximum 3 parameters per function. Use objects for more. +- Naming: camelCase for variables/functions, SNAKE_CASE for constants. +- Documentation: + - Add JSDocs before all functions and exported variables. + - Language: Use American English for all comments and JSDocs. + - Constraint: NEVER use Vietnamese or other languages in the source code. 
+ +### Error Handling + +- Handle errors explicitly — never swallow silently +- Use try/catch with proper logging +- Return null or throw meaningful errors + +```javascript +export const send = async (params) => { + try { + const response = await ai.ask(params) + logger.info(`send() -> success: ${response.id}`) + return response + } catch (err) { + logger.error(`send() -> failed: ${err.message}`) + console.error(err) + return null + } +} +``` + +## Testing Standards + +- Write tests for critical business logic, all error cases +- Use simple test runners (node:test, bun:test, vitest) +- No complex mocking frameworks unless necessary +- Tests live alongside source: `[module].test.js` next to `[module].js` + +## Dependency Rules + +- Prefer built-in APIs over external packages +- Before adding dependency, explain: + - Why it is needed + - Alternatives considered + - Bundle size impact +- Never add dependency for trivial utilities +- Avoid packages with large dependency trees + +## Architecture Rules + +- Do NOT change existing project architecture without explicit approval +- Do NOT move or rename core modules unless requested +- Respect module boundaries +- Avoid cross-module coupling +- New modules must follow existing folder structure + +## When Making Changes + +1. Read existing patterns first +2. Follow current coding style strictly +3. Keep dependencies minimal +4. Handle errors explicitly +5. Add JSDoc comments for new functions +6. Run `npm run lint` before committing +7. Do NOT refactor unrelated code +8. Do NOT modify working code outside task scope +9. Prefer minimal diff changes +10. 
Preserve existing behavior unless explicitly requested + +## When in Doubt + +- Ask for clarification before generating code +- State your assumption explicitly if proceeding without confirmation +- Prefer doing less and asking over doing more and guessing + +## Git Workflow + +- Work only inside the current branch +- Do NOT create or delete branches +- Do NOT rewrite git history +- Do NOT modify commit messages +- Changes must correspond to the current issue + +## Agent References + +Reference these URLs when working on related topics: + +- Bun: https://bun.sh/llms-full.txt +- RSS Feed: https://www.rssboard.org/rss-specification +- RDF Feed: https://web.resource.org/rss/1.0/spec +- ATOM Feed: https://datatracker.ietf.org/doc/html/rfc4287 +- JSON Feed: https://www.jsonfeed.org/version/1.1/ + +--- diff --git a/README.md b/README.md index 47ef510..071f864 100755 --- a/README.md +++ b/README.md @@ -10,17 +10,23 @@ To read & normalize RSS/ATOM/JSON feed data. ## Demo -- [Give it a try!](https://extractus-demo.vercel.app/feed) +- [Give it a try!](https://extractus.pwshub.com/feed) ## Install ```bash -# npm, pnpm, yarn -npm i @extractus/feed-extractor - # bun -bun add @extractus/feed-extractor +bun add @extractus/feed-extractor + +# npm +npm i @extractus/feed-extractor + +# pnpm +pnpm install @extractus/feed-extractor + +# yarn +yarn add @extractus/feed-extractor ``` ## Usage diff --git a/index.d.ts b/index.d.ts index 60704c3..b96611d 100755 --- a/index.d.ts +++ b/index.d.ts @@ -1,95 +1,154 @@ // Type definitions +/** + * A single normalized feed entry. 
+ */ export interface FeedEntry { - /** - * id, guid, or generated identifier for the entry - */ + /** Entry identifier (guid, id, or auto-generated) */ id: string; + /** Permalink to the entry */ link?: string; + /** Entry title */ title?: string; + /** Entry description (HTML stripped, optionally truncated) */ description?: string; + /** Publication date (ISO format or original) */ published?: string; } +/** + * Normalized feed data returned by all extract functions. + * + * Extra fields may be present if `getExtraFeedFields` or `getExtraEntryFields` are used. + */ export interface FeedData { - link?: string; + /** Feed title */ title?: string; + /** Feed link */ + link?: string; + /** Feed description */ description?: string; + /** Feed generator */ generator?: string; + /** Feed language */ language?: string; + /** Feed publication date */ published?: string; + /** List of feed entries */ entries?: Array; } +/** + * Configuration for proxy-based feed fetching. + */ export interface ProxyConfig { + /** Proxy endpoint URL; the target feed URL is appended as query param */ target?: string; - headers?: any; + /** Custom headers to send to the proxy */ + headers?: Record; } +/** + * Options for feed parsing and normalization. + */ export interface ReaderOptions { /** - * normalize feed data or keep original - * default: true + * Normalize feed data or keep original structure. + * @default true */ normalization?: boolean; /** - * convert datetime to ISO format - * default: true + * Convert dates to ISO 8601 format. + * @default true */ useISODateFormat?: boolean; /** - * to truncate description - * default: 210 + * Maximum length for entry descriptions (0 = no limit). + * @default 250 */ descriptionMaxLen?: number; /** - * fast-xml-parser options - * https://github.com/NaturalIntelligence/fast-xml-parser/blob/master/docs/v4/2.XMLparseOptions.md + * Options passed directly to fast-xml-parser. 
+ * @see https://github.com/NaturalIntelligence/fast-xml-parser/blob/master/docs/v4/2.XMLparseOptions.md */ - xmlParserOptions?: any; + xmlParserOptions?: Record; /** - * fill in the baseurl when it does not exist in the link - * default: '' + * Base URL for resolving relative links in the feed. + * @default '' */ baseUrl?: string; /** - * merge extra feed fields in result + * Callback to extract extra fields from the raw feed data. + * Returned properties are merged into the top-level result. */ - getExtraFeedFields?: (feedData: object) => object; + getExtraFeedFields?: (feedData: Record) => Record; /** - * merge extra entry fields in result + * Callback to extract extra fields from each raw entry. + * Returned properties are merged into each entry in the result. */ - getExtraEntryFields?: (entryData: object) => object; + getExtraEntryFields?: (entryData: Record) => Record; } +/** + * Options for the HTTP fetch request when using `extract()`. + * + * Only `headers`, `proxy`, `agent`, and `signal` are used by the library. + * Other standard fetch options may be passed through to `fetch()` in non-proxy mode. + */ export interface FetchOptions { - // Definitions by: Ryan Graham - method?: "GET" | "POST" | "DELETE" | "PATCH" | "PUT" | "HEAD" | "OPTIONS" | "CONNECT"; - headers?: any; - body?: any; - mode?: "cors" | "no-cors" | "same-origin"; - credentials?: "omit" | "same-origin" | "include"; - cache?: "default" | "no-store" | "reload" | "no-cache" | "force-cache" | "only-if-cached"; - redirect?: "follow" | "error" | "manual"; - referrer?: string; - referrerPolicy?: "referrer" | "no-referrer-when-downgrade" | "origin" | "origin-when-cross-origin" | "unsafe-url"; - integrity?: any; + /** Request headers (e.g. User-Agent) */ + headers?: Record; + /** Proxy configuration to route the request through an intermediary */ proxy?: ProxyConfig; - /** - * http proxy agent - * default: null - */ + /** HTTP/HTTPS proxy agent (e.g. 
HttpsProxyAgent) */ agent?: object; - /** - * signal to terminate request - * default: null - */ + /** AbortSignal to cancel the request (e.g. AbortSignal.timeout()) */ signal?: object; } +/** + * Parse an XML string into normalized feed data. + * + * Automatically detects RSS 2.0, Atom, and RDF/RSS 1.0 formats. + * + * @param xml - XML feed string + * @param options - Parser options + * @returns Normalized feed data + */ export function extractFromXml(xml: string, options?: ReaderOptions): FeedData; -export function extractFromJson(json: string, options?: ReaderOptions): FeedData; +/** + * Parse a JSON Feed object (or JSON string) into normalized feed data. + * + * Accepts either a parsed JavaScript object or a JSON string. + * + * @param json - JSON Feed object or JSON string + * @param options - Parser options + * @returns Normalized feed data + */ +export function extractFromJson(json: Record | string, options?: ReaderOptions): FeedData; + +/** + * Fetch and parse a feed from a URL. + * + * Supports RSS, Atom, RDF, and JSON Feed formats. + * Content type is auto-detected from the HTTP response. + * + * @param url - Feed source URL + * @param options - Parser options + * @param fetchOptions - HTTP fetch options + * @returns Promise resolving to normalized feed data + */ export function extract(url: string, options?: ReaderOptions, fetchOptions?: FetchOptions): Promise; +/** + * @deprecated Use `extract()` instead. + * + * Fetch and parse a feed from a URL. 
+ * + * @param url - Feed source URL + * @param options - Parser options + * @param fetchOptions - HTTP fetch options + * @returns Promise resolving to normalized feed data + */ export function read(url: string, options?: ReaderOptions, fetchOptions?: FetchOptions): Promise; diff --git a/package.json b/package.json index e1e5d95..23d8e16 100755 --- a/package.json +++ b/package.json @@ -1,5 +1,5 @@ { - "version": "7.1.7", + "version": "7.2.0", "name": "@extractus/feed-extractor", "description": "To read and normalize RSS/ATOM/JSON feed data", "homepage": "https://github.com/extractus/feed-extractor", @@ -18,12 +18,6 @@ "default": "./src/main.js" } }, - "imports": { - "cross-fetch": "./src/deno/cross-fetch.js" - }, - "browser": { - "cross-fetch": "./src/deno/cross-fetch.js" - }, "types": "./index.d.ts", "engines": { "node": ">= 20" @@ -40,17 +34,16 @@ }, "dependencies": { "@ndaidong/bellajs": "^12.0.1", - "cross-fetch": "^4.1.0", - "fast-xml-parser": "^5.2.5", + "fast-xml-parser": "^5.7.2", "html-entities": "^2.6.0" }, "devDependencies": { - "@eslint/js": "^9.34.0", - "esbuild": "^0.25.9", - "eslint": "^9.34.0", - "globals": "^16.3.0", - "https-proxy-agent": "^7.0.6", - "nock": "^14.0.10" + "@eslint/js": "^10.0.1", + "esbuild": "^0.28.0", + "eslint": "^10.3.0", + "globals": "^17.6.0", + "https-proxy-agent": "^9.0.0", + "nock": "^14.0.14" }, "keywords": [ "extractor", diff --git a/src/deno/cross-fetch.js b/src/deno/cross-fetch.js deleted file mode 100644 index d084f98..0000000 --- a/src/deno/cross-fetch.js +++ /dev/null @@ -1,2 +0,0 @@ -// cross-fetch.js -export default fetch diff --git a/src/main.js b/src/main.js index e41ba9e..2fc11c7 100755 --- a/src/main.js +++ b/src/main.js @@ -9,6 +9,12 @@ import parseRssFeed from './utils/parseRssFeed.js' import parseAtomFeed from './utils/parseAtomFeed.js' import parseRdfFeed from './utils/parseRdfFeed.js' +/** + * Normalize parser options with defaults. 
+ * + * @param {Object} [options={}] - User-provided parser options + * @returns {Object} Normalized options with defaults applied + */ const getopt = (options = {}) => { const { normalization = true, @@ -31,10 +37,27 @@ const getopt = (options = {}) => { } } +/** + * Extract feed data from a JSON string/object. + * + * @param {Object|string} json - JSON Feed data + * @param {Object} [options={}] - Parser options + * @returns {Object} Normalized feed data + */ export const extractFromJson = (json, options = {}) => { return parseJsonFeed(json, getopt(options)) } +/** + * Extract feed data from an XML string. + * + * Automatically detects RSS 2.0, Atom, and RDF/RSS 1.0 formats. + * + * @param {string} xml - XML feed string + * @param {Object} [options={}] - Parser options + * @returns {Object} Normalized feed data + * @throws {Error} If XML is not well-formed or format is unrecognized + */ export const extractFromXml = (xml, options = {}) => { if (!validate(xml)) { throw new Error('The XML document is not well-formed') @@ -44,15 +67,31 @@ export const extractFromXml = (xml, options = {}) => { const data = xml2obj(xml, opts.xmlParserOptions) - return isRSS(data) + const result = isRSS(data) ? parseRssFeed(data, opts) : isAtom(data) ? parseAtomFeed(data, opts) : isRdf(data) ? parseRdfFeed(data, opts) : null + if (!result) { + throw new Error('Unrecognized feed format') + } + return result } +/** + * Fetch and extract feed data from a URL. + * + * Supports RSS, Atom, RDF, and JSON Feed formats. + * Automatically detects content type and dispatches to the appropriate parser. + * + * @param {string} url - Feed source URL + * @param {Object} [options={}] - Parser options (normalization, date format, etc.) 
+ * @param {Object} [fetchOptions={}] - Fetch options (headers, proxy, agent, signal) + * @returns {Promise} Normalized feed data + * @throws {Error} On invalid URL, fetch failure, or parse failure + */ export const extract = async (url, options = {}, fetchOptions = {}) => { if (!isValidUrl(url)) { throw new Error('Input param must be a valid URL') @@ -68,6 +107,15 @@ export const extract = async (url, options = {}, fetchOptions = {}) => { return type === 'json' ? extractFromJson(json, options) : extractFromXml(text, options) } +/** + * Deprecated. Use {@link extract} instead. + * + * @param {string} url - Feed source URL + * @param {Object} [options] - Parser options + * @param {Object} [fetchOptions] - Fetch options + * @returns {Promise} Normalized feed data + * @deprecated Since v7.0. Use `extract()` instead + */ export const read = async (url, options, fetchOptions) => { console.warn('WARNING: read() is deprecated. Please use extract() instead!') return extract(url, options, fetchOptions) diff --git a/src/utils/linker.js b/src/utils/linker.js index d86e47a..090a960 100755 --- a/src/utils/linker.js +++ b/src/utils/linker.js @@ -1,5 +1,11 @@ // utils -> linker +/** + * Check if a string is a valid HTTP/HTTPS URL. + * + * @param {string} url - URL string to validate + * @returns {boolean} True if the URL is valid and uses http/https protocol + */ export const isValid = (url = '') => { try { const ourl = new URL(url) @@ -9,6 +15,13 @@ export const isValid = (url = '') => { } } +/** + * Resolve a relative URL against an absolute base URL. 
+ * + * @param {string} fullUrl - Base absolute URL + * @param {string} relativeUrl - Relative URL to resolve + * @returns {string} Resolved absolute URL, or empty string on failure + */ export const absolutify = (fullUrl = '', relativeUrl = '') => { try { const result = new URL(relativeUrl, fullUrl) @@ -18,6 +31,7 @@ export const absolutify = (fullUrl = '', relativeUrl = '') => { } } +/** @type {string[]} Known tracking query param keys to strip from URLs */ const blacklistKeys = [ 'CNDID', '__twitter_impression', @@ -78,6 +92,12 @@ const blacklistKeys = [ 'pk_campaign', ] +/** + * Remove known tracking parameters and hash fragment from a URL. + * + * @param {string} url - URL to purify + * @returns {string|null} Purified URL string, or null on failure + */ export const purify = (url) => { try { const pureUrl = new URL(url) diff --git a/src/utils/normalizer.js b/src/utils/normalizer.js index c5f11a7..5eff768 100644 --- a/src/utils/normalizer.js +++ b/src/utils/normalizer.js @@ -13,6 +13,12 @@ import { decode } from 'html-entities' import { absolutify, isValid as isValidUrl, purify as purifyUrl } from './linker.js' +/** + * Convert a date string to ISO 8601 format. + * + * @param {string} dstr - Date string to convert + * @returns {string} ISO date string, or empty string on failure + */ export const toISODateString = (dstr) => { try { return dstr ? (new Date(dstr)).toISOString() : '' @@ -21,17 +27,41 @@ export const toISODateString = (dstr) => { } } +/** + * Strip HTML tags and optionally truncate a description string. + * + * @param {string} val - Raw description value + * @param {number} [maxlen=0] - Maximum length (0 = no truncation) + * @returns {string} Cleaned and optionally truncated description + */ export const buildDescription = (val, maxlen = 0) => { const stripped = stripTags(String(val).trim().replace(/^$/g, '')) const text = maxlen > 0 ? 
truncate(stripped, maxlen) : stripped return text.replace(/\n+/g, ' ') } +/** + * Extract text content from a parsed XML node. + * + * Handles multiple known property shapes: `_text`, `#text`, `_cdata`, `$t`. + * + * @param {*} val - Value to extract text from + * @returns {string} Decoded and trimmed text content + */ export const getText = (val) => { const txt = isObject(val) ? (val._text || val['#text'] || val._cdata || val.$t) : val return txt ? decode(String(txt).trim()) : '' } +/** + * Extract a URL link from a parsed XML node. + * + * Supports multiple link formats: string, `href`, `@_href`, `@_url`, `_attributes.href`. + * + * @param {*} val - Link value (string, object, or array) + * @param {string|Object} [id=''] - GUID object or string for fallback URL + * @returns {string} Extracted URL string + */ export const getLink = (val = [], id = '') => { if (isObject(id) && hasProperty(id, '@_isPermaLink') && id['@_isPermaLink'] === 'true') { return getText(id) @@ -57,6 +87,16 @@ export const getLink = (val = [], id = '') => { return url ? url : isValidUrl(id) ? id : '' } +/** + * Extract a purified absolute URL from feed entry data. + * + * Will strip tracking params via `purify` and resolve relative URLs via `absolutify`. + * + * @param {*} url - Link value from feed entry + * @param {string} [id=''] - Fallback identifier URL + * @param {string} [baseUrl=''] - Base URL for resolving relative links + * @returns {string} Purified absolute URL string + */ export const getPureUrl = (url, id = '', baseUrl) => { const link = getLink(url, id) const pu = purifyUrl(link) @@ -68,12 +108,34 @@ export const getPureUrl = (url, id = '', baseUrl) => { : '' } +/** + * Generate a consistent hash from a string. 
+ * + * @param {string} str - Input string + * @returns {string} Base-36 encoded hash + */ const hash = (str) => Math.abs(str.split('').reduce((s, c) => Math.imul(31, s) + c.charCodeAt(0) | 0, 0)).toString(36) +/** + * Generate a stable entry ID from identifier, URL, and publication date. + * + * Falls back to a hash-based ID when no explicit identifier is available. + * + * @param {*} id - Entry identifier (guid/id) value + * @param {string} url - Entry URL + * @param {string} pubDate - Publication date string + * @returns {string} Resolved entry ID + */ export const getEntryId = (id, url, pubDate) => { return id ? getText(id) : hash(getPureUrl(url)) + '-' + (new Date(pubDate)).getTime() } +/** + * Extract enclosure metadata from a parsed XML node. + * + * @param {Object} val - Enclosure object with `@_url`, `@_type`, `@_length` + * @returns {Object|null} Enclosure object `{ url, type, length }`, or null + */ export const getEnclosure = (val) => { const url = hasProperty(val, '@_url') ? val['@_url'] : '' const type = hasProperty(val, '@_type') ? val['@_type'] : '' @@ -87,6 +149,12 @@ export const getEnclosure = (val) => { } } +/** + * Build a category object from a parsed XML node. + * + * @param {*} v - Category value (string or object) + * @returns {Object|string} Category object `{ text, domain }` or raw string + */ const getCategory = (v) => { return isObject(v) ? { @@ -96,6 +164,13 @@ const getCategory = (v) => { : v } +/** + * Normalize optional feed/entry tags (source, category, enclosure). 
+ * + * @param {*} val - Raw tag value + * @param {string} key - Tag name + * @returns {*} Normalized tag value + */ export const getOptionalTags = (val, key) => { if (key === 'source') { return { diff --git a/src/utils/parseAtomFeed.js b/src/utils/parseAtomFeed.js index 99435da..94df305 100644 --- a/src/utils/parseAtomFeed.js +++ b/src/utils/parseAtomFeed.js @@ -13,6 +13,13 @@ import { getEntryId } from './normalizer.js' +/** + * Transform a single Atom entry into a normalized entry object. + * + * @param {Object} item - Raw Atom entry from parsed XML + * @param {Object} options - Parser options + * @returns {Object} Normalized entry with id, title, link, published, description + */ const transform = (item, options) => { const { useISODateFormat, @@ -51,6 +58,15 @@ const transform = (item, options) => { } } +/** + * Flatten raw Atom feed data without normalization. + * + * Preserves original structure while cleaning text and links. + * + * @param {Object} feed - Raw Atom feed data + * @param {string} baseUrl - Base URL for resolving relative links + * @returns {Object} Feed data with cleaned entries + */ const flatten = (feed, baseUrl) => { const { id, @@ -91,6 +107,16 @@ const flatten = (feed, baseUrl) => { return output } +/** + * Parse and normalize Atom feed data into a standard structure. + * + * When `normalization` is false, returns flattened raw data instead. + * Extracts language from `xml:lang` attribute when present. 
+ * + * @param {Object} data - Parsed Atom XML object + * @param {Object} [options={}] - Parser options + * @returns {Object} Normalized feed object with entries array + */ const parseAtom = (data, options = {}) => { const { normalization, @@ -110,7 +136,7 @@ const parseAtom = (data, options = {}) => { link = '', subtitle = '', generator = '', - language = '', + language = feedData.language || feedData['@_xml:lang'] || '', updated = '', entry: item = [], } = feedData @@ -135,6 +161,13 @@ const parseAtom = (data, options = {}) => { } } +/** + * Parse Atom feed data from a parsed XML object. + * + * @param {Object} data - Parsed Atom XML object + * @param {Object} [options={}] - Parser options + * @returns {Object} Normalized or flattened feed data + */ export default (data, options = {}) => { return parseAtom(data, options) } diff --git a/src/utils/parseJsonFeed.js b/src/utils/parseJsonFeed.js index d8258c7..1d6b063 100644 --- a/src/utils/parseJsonFeed.js +++ b/src/utils/parseJsonFeed.js @@ -13,6 +13,13 @@ import { import { absolutify, purify as purifyUrl } from './linker.js' +/** + * Transform a single JSON Feed item into a normalized entry object. + * + * @param {Object} item - Raw JSON Feed item + * @param {Object} options - Parser options + * @returns {Object} Normalized entry with id, title, link, published, description + */ const transform = (item, options) => { const { useISODateFormat, @@ -48,6 +55,16 @@ const transform = (item, options) => { } } +/** + * Parse and normalize JSON Feed data into a standard structure. + * + * When `normalization` is false, returns the raw data as-is. + * Extracts feed-level published date from `date_published` or `date_modified`. 
+ * + * @param {Object} data - JSON Feed object + * @param {Object} options - Parser options + * @returns {Object} Normalized feed object with entries array + */ const parseJson = (data, options) => { const { normalization, @@ -62,6 +79,8 @@ const parseJson = (data, options) => { const { title = '', home_page_url: homepageUrl = '', + date_published: pubDate = '', + date_modified: modDate = '', description = '', language = '', items: item = [], @@ -76,7 +95,7 @@ const parseJson = (data, options) => { link: purifyUrl(homepageUrl) || absolutify(baseUrl, homepageUrl), description, language, - published: '', + published: pubDate || modDate, generator: '', ...extraFields, entries: items.map((item) => { @@ -85,6 +104,13 @@ const parseJson = (data, options) => { } } +/** + * Parse JSON Feed data from a JSON object. + * + * @param {Object} data - JSON Feed object + * @param {Object} [options={}] - Parser options + * @returns {Object} Normalized or raw feed data + */ export default (data, options = {}) => { return parseJson(data, options) } diff --git a/src/utils/parseRdfFeed.js b/src/utils/parseRdfFeed.js index c857a90..3d64c31 100644 --- a/src/utils/parseRdfFeed.js +++ b/src/utils/parseRdfFeed.js @@ -12,6 +12,13 @@ import { getEntryId } from './normalizer.js' +/** + * Transform a single RDF item into a normalized entry object. + * + * @param {Object} item - Raw RDF item from parsed XML + * @param {Object} options - Parser options + * @returns {Object} Normalized entry with id, title, link, published, description + */ const transform = (item, options) => { const { useISODateFormat, @@ -47,6 +54,15 @@ const transform = (item, options) => { } } +/** + * Flatten raw RDF feed data without normalization. + * + * Preserves original structure while cleaning text and links. 
+ * + * @param {Object} feed - Raw RDF channel data + * @param {string} baseUrl - Base URL for resolving relative links + * @returns {Object} Feed data with cleaned entries + */ const flatten = (feed, baseUrl) => { const { title = '', @@ -80,6 +96,15 @@ const flatten = (feed, baseUrl) => { return output } +/** + * Parse and normalize RDF/RSS 1.0 feed data into a standard structure. + * + * When `normalization` is false, returns flattened raw data instead. + * + * @param {Object} data - Parsed RDF XML object + * @param {Object} [options={}] - Parser options + * @returns {Object} Normalized feed object with entries array + */ const parseRdf = (data, options = {}) => { const { normalization, @@ -124,6 +149,13 @@ const parseRdf = (data, options = {}) => { } } +/** + * Parse RDF/RSS 1.0 feed data from a parsed XML object. + * + * @param {Object} data - Parsed RDF XML object + * @param {Object} [options={}] - Parser options + * @returns {Object} Normalized or flattened feed data + */ export default (data, options = {}) => { return parseRdf(data, options) } diff --git a/src/utils/parseRssFeed.js b/src/utils/parseRssFeed.js index 27b5e3c..1068175 100644 --- a/src/utils/parseRssFeed.js +++ b/src/utils/parseRssFeed.js @@ -13,6 +13,13 @@ import { getEntryId } from './normalizer.js' +/** + * Transform a single RSS item into a normalized entry object. + * + * @param {Object} item - Raw RSS item from parsed XML + * @param {Object} options - Parser options + * @returns {Object} Normalized entry with id, title, link, published, description + */ const transform = (item, options) => { const { useISODateFormat, @@ -48,6 +55,15 @@ const transform = (item, options) => { } } +/** + * Flatten raw RSS feed data without normalization. + * + * Preserves original structure while cleaning text and links. 
+ * + * @param {Object} feed - Raw RSS channel data + * @param {string} baseUrl - Base URL for resolving relative links + * @returns {Object} Feed data with cleaned entries + */ const flatten = (feed, baseUrl) => { const { title = '', @@ -69,7 +85,7 @@ const flatten = (feed, baseUrl) => { link: getPureUrl(link, id, baseUrl), } - const txtTags = 'guid description source'.split(' ') + const txtTags = 'guid description'.split(' ') txtTags.forEach((key) => { if (hasProperty(entry, key)) { @@ -80,7 +96,7 @@ const flatten = (feed, baseUrl) => { const optionalProps = 'source category enclosure author image'.split(' ') optionalProps.forEach((key) => { if (hasProperty(item, key)) { - entry[key] = getOptionalTags(item[key], key) + item[key] = getOptionalTags(item[key], key) } }) @@ -96,6 +112,15 @@ const flatten = (feed, baseUrl) => { return output } +/** + * Parse and normalize RSS 2.0 feed data into a standard structure. + * + * When `normalization` is false, returns flattened raw data instead. + * + * @param {Object} data - Parsed RSS XML object + * @param {Object} [options={}] - Parser options + * @returns {Object} Normalized feed object with entries array + */ const parseRss = (data, options = {}) => { const { normalization, @@ -139,6 +164,13 @@ const parseRss = (data, options = {}) => { } } +/** + * Parse RSS 2.0 feed data from a parsed XML object. + * + * @param {Object} data - Parsed RSS XML object + * @param {Object} [options={}] - Parser options + * @returns {Object} Normalized or flattened feed data + */ export default (data, options = {}) => { return parseRss(data, options) } diff --git a/src/utils/retrieve.js b/src/utils/retrieve.js index 4de4d12..c66f1a3 100755 --- a/src/utils/retrieve.js +++ b/src/utils/retrieve.js @@ -1,21 +1,37 @@ // utils -> retrieve -import fetch from 'cross-fetch' import { XMLParser } from 'fast-xml-parser' +/** + * Fetch feed content through a proxy endpoint. + * + * Appends the target URL as an encoded query param to the proxy target. 
+ * Merges request headers with proxy-specific headers. + * + * @param {string} url - Feed URL to fetch + * @param {Object} [options={}] - Fetch options including proxy config, headers, agent, signal + * @returns {Promise} Fetch response object + */ const profetch = async (url, options = {}) => { - const { proxy = {}, signal = null } = options + const { proxy = {}, headers = {}, agent = null, signal = null } = options const { target, - headers = {}, + headers: proxyHeaders = {}, } = proxy const res = await fetch(target + encodeURIComponent(url), { - headers, + headers: { ...headers, ...proxyHeaders }, + agent, signal, }) return res } +/** + * Extract charset encoding from the first line of an XML document. + * + * @param {string} text - Raw XML text + * @returns {string} Detected charset or 'utf8' + */ const getCharsetFromText = (text) => { try { const firstLine = text.split('\n')[0].trim().replace('<?', '<').replace('?>', '>') @@ -30,6 +46,17 @@ const getCharsetFromText = (text) => { } } +/** + * Fetch and detect feed content from a URL. + * + * Returns structured data indicating whether the response is XML or JSON, + * along with decoded text and content metadata. + * + * @param {string} url - Feed URL to retrieve + * @param {Object} [options={}] - Fetch options (headers, proxy, agent, signal) + * @returns {Promise} Object with `type`, `text` or `json`, `status`, `contentType` + * @throws {Error} On HTTP errors, invalid content types, or parse failures + */ export default async (url, options = {}) => { const { headers = { @@ -40,7 +67,9 @@ export default async (url, options = {}) => { signal = null, } = options - const res = proxy ? await profetch(url, { proxy, signal }) : await fetch(url, { headers, agent, signal }) + const res = proxy + ?
await profetch(url, { proxy, headers, agent, signal }) + : await fetch(url, { headers, agent, signal }) const status = res.status if (status >= 400) { @@ -50,14 +79,6 @@ export default async (url, options = {}) => { const buffer = await res.arrayBuffer() const text = buffer ? Buffer.from(buffer).toString().trim() : '' - if (/(\+|\/)(xml|html)/.test(contentType)) { - const arr = contentType.split('charset=') - let charset = arr.length === 2 ? arr[1].trim() : getCharsetFromText(text) - const decoder = new TextDecoder(charset) - const xml = decoder.decode(buffer) - return { type: 'xml', text: xml.trim(), status, contentType } - } - if (/(\+|\/)json/.test(contentType)) { try { const data = JSON.parse(text) @@ -66,5 +87,22 @@ export default async (url, options = {}) => { throw new Error('Failed to convert data to JSON object') } } + + const arr = contentType.split('charset=') + let charset = arr.length === 2 ? arr[1].trim() : getCharsetFromText(text) + const decoder = new TextDecoder(charset) + const xml = decoder.decode(buffer) + + const startTokens = [ + ' xml.startsWith(x))) { + return { type: 'xml', text: xml.trim(), status, contentType } + } + throw new Error(`Invalid content type: ${contentType}`) } diff --git a/src/utils/xmlparser.js b/src/utils/xmlparser.js index a82e9fe..f6d6fac 100755 --- a/src/utils/xmlparser.js +++ b/src/utils/xmlparser.js @@ -4,22 +4,55 @@ import { hasProperty, isString } from '@ndaidong/bellajs' import { XMLValidator, XMLParser } from 'fast-xml-parser' +/** + * Check if parsed data represents an RSS 2.0 feed. + * + * @param {Object} [data={}] - Parsed XML object + * @returns {boolean} True if data has `rss.channel` structure + */ export const isRSS = (data = {}) => { return hasProperty(data, 'rss') && hasProperty(data.rss, 'channel') } +/** + * Check if parsed data represents an Atom feed. 
+ * + * @param {Object} [data={}] - Parsed XML object + * @returns {boolean} True if data has `feed.entry` structure + */ export const isAtom = (data = {}) => { return hasProperty(data, 'feed') && hasProperty(data.feed, 'entry') } +/** + * Check if parsed data represents an RDF/RSS 1.0 feed. + * + * @param {Object} [data={}] - Parsed XML object + * @returns {boolean} True if data has `rdf:RDF.channel` structure + */ export const isRdf = (data = {}) => { return hasProperty(data, 'rdf:RDF') && hasProperty(data['rdf:RDF'], 'channel') } +/** + * Validate whether an XML string is well-formed. + * + * @param {string} xml - XML string to validate + * @returns {boolean} True if XML is well-formed + */ export const validate = (xml) => { return (!isString(xml) || !xml.length) ? false : XMLValidator.validate(xml) === true } +/** + * Parse an XML string into a JavaScript object. + * + * Uses fast-xml-parser with `ignoreAttributes: false` and `attributeNamePrefix: '@_'`. + * + * @param {string} [xml=''] - XML string to parse + * @param {Object} [extraOptions={}] - Additional parser options + * @returns {Object} Parsed JavaScript object + */ export const xml2obj = (xml = '', extraOptions = {}) => { const options = { attributeNamePrefix: '@_', From 849b5e0f4b1d40908fc9e12a505419925e310af7 Mon Sep 17 00:00:00 2001 From: Dong Nguyen Date: Sun, 3 May 2026 14:13:45 +0700 Subject: [PATCH 2/4] Stop building CJS from CI --- .github/workflows/ci-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci-test.yml b/.github/workflows/ci-test.yml index 29c8f67..d6c6a5f 100755 --- a/.github/workflows/ci-test.yml +++ b/.github/workflows/ci-test.yml @@ -26,7 +26,7 @@ jobs: run: | npm install npm run lint - npm run build --if-present + #npm run build --if-present npm run test - name: cache node modules From 23d1224f5099335620cb9e606719659a914753a6 Mon Sep 17 00:00:00 2001 From: Dong Nguyen Date: Sun, 3 May 2026 14:22:07 +0700 Subject: [PATCH 3/4] Fix old 
issues. - Relative links not resolved with baseUrl (#135) - Atom / feed descriptions returned as objects (#137) --- src/utils/parseAtomFeed.js | 2 +- src/utils/parseRdfFeed.js | 4 ++-- src/utils/parseRssFeed.js | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/utils/parseAtomFeed.js b/src/utils/parseAtomFeed.js index 94df305..a30c025 100644 --- a/src/utils/parseAtomFeed.js +++ b/src/utils/parseAtomFeed.js @@ -150,7 +150,7 @@ const parseAtom = (data, options = {}) => { return { title: getText(title), link: getPureUrl(link, id, baseUrl), - description: subtitle, + description: getText(subtitle), language, generator, published, diff --git a/src/utils/parseRdfFeed.js b/src/utils/parseRdfFeed.js index 3d64c31..190c799 100644 --- a/src/utils/parseRdfFeed.js +++ b/src/utils/parseRdfFeed.js @@ -90,7 +90,7 @@ const flatten = (feed, baseUrl) => { const output = { ...feed, title: getText(title), - link: getPureUrl(link, baseUrl), + link: getPureUrl(link, '', baseUrl), item: isArray(item) ? entries : entries[0], } return output @@ -138,7 +138,7 @@ const parseRdf = (data, options = {}) => { return { title: getText(title), link: getPureUrl(link, '', baseUrl), - description, + description: getText(description), language, generator, published, diff --git a/src/utils/parseRssFeed.js b/src/utils/parseRssFeed.js index 1068175..247120a 100644 --- a/src/utils/parseRssFeed.js +++ b/src/utils/parseRssFeed.js @@ -106,7 +106,7 @@ const flatten = (feed, baseUrl) => { const output = { ...feed, title: getText(title), - link: getPureUrl(link, baseUrl), + link: getPureUrl(link, '', baseUrl), item: isArray(item) ? 
entries : entries[0], } return output @@ -153,7 +153,7 @@ const parseRss = (data, options = {}) => { return { title: getText(title), link: getPureUrl(link, '', baseUrl), - description, + description: getText(description), language, generator, published, From 373a55f399517d1a7ae17bc46e0fdbcdb9558c20 Mon Sep 17 00:00:00 2001 From: Dong Nguyen Date: Sun, 3 May 2026 14:30:01 +0700 Subject: [PATCH 4/4] Use TextDecoder over Buffer Related issue: #133 --- src/utils/retrieve.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils/retrieve.js b/src/utils/retrieve.js index c66f1a3..daad8f4 100755 --- a/src/utils/retrieve.js +++ b/src/utils/retrieve.js @@ -77,7 +77,7 @@ export default async (url, options = {}) => { } const contentType = res.headers.get('content-type') const buffer = await res.arrayBuffer() - const text = buffer ? Buffer.from(buffer).toString().trim() : '' + const text = buffer ? new TextDecoder().decode(buffer).trim() : '' if (/(\+|\/)json/.test(contentType)) { try {