From a60056f1ff1f1329f40d2f43b10b95380d5794cb Mon Sep 17 00:00:00 2001 From: aboldguess Date: Fri, 3 Oct 2025 12:41:25 +0100 Subject: [PATCH] Allow configuring DSTL domain from defaults and scripts --- README.md | 37 +++++++++++ frontend/help.ejs | 2 +- scripts/rpi_bidfinder.sh | 78 ++++++++++++++++++---- scripts/run.sh | 135 ++++++++++++++++++++++++++++++++++++--- server/config.js | 75 ++++++++++++++++++++++ server/index.js | 11 +--- 6 files changed, 308 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index d7e15ee..f1c3499 100644 --- a/README.md +++ b/README.md @@ -107,6 +107,43 @@ pkill -f "node server/index.js" accounts. - `ENABLE_LOG_STREAM` - set to `false` to disable the `/logs` streaming endpoint in production and avoid exposing real-time log data if not required. +- `ALLOWED_SOURCE_DOMAINS` - comma-separated list of additional hostnames that + administrators are permitted to use when defining custom feeds. The value is + merged with the built-in allow list (which already includes + `contracts.mod.uk` for the DSTL portal). + +### Allowing additional source domains + +If you see a message similar to `Search URL rejected: Host "example.org" +is not on the allow list` while adding a feed, extend the allow list using the +`ALLOWED_SOURCE_DOMAINS` environment variable. Example commands: + +- **Linux/macOS/Raspberry Pi** + ```bash + export ALLOWED_SOURCE_DOMAINS="example.org" + ``` +- **Windows PowerShell** + ```powershell + $env:ALLOWED_SOURCE_DOMAINS="example.org" + ``` + +Restart the server after setting the variable so the new domains are loaded. +Multiple hostnames can be supplied by separating them with commas, for example +`example.org,example.net`.
+ +For convenience, the helper script below can export the variable and launch the +server in one step: + +```bash +./scripts/run.sh --allow-domain example.org +``` + +To configure a Raspberry Pi in a single command, the setup script forwards the +same option to the background server: + +```bash +./scripts/rpi_bidfinder.sh --allow-domain example.org 4000 +``` ## Scheduled cron job diff --git a/frontend/help.ejs b/frontend/help.ejs index 2f7885a..6627c4d 100644 --- a/frontend/help.ejs +++ b/frontend/help.ejs @@ -29,7 +29,7 @@
  • Parser – choose the parser from the dropdown. The default (<%= defaultParser %>) matches most Contracts Finder feeds.
  • Award sources follow the same structure but point at feeds of awarded contracts.

    -

    If you see a “URL not permitted” warning, check that both addresses start with https:// and that the hostname appears in the allowed list. Administrators can extend the allow list using the ALLOWED_SOURCE_DOMAINS environment variable (comma-separated hostnames).

    +

    If you see a “URL not permitted” warning, check that both addresses start with https:// and that the hostname appears in the allowed list. DSTL’s contracts.mod.uk domain is pre-approved, but you can extend the list with the ALLOWED_SOURCE_DOMAINS environment variable or by launching the server through ./scripts/run.sh --allow-domain example.org.

    Parser Reference

    diff --git a/scripts/rpi_bidfinder.sh b/scripts/rpi_bidfinder.sh index 6e11b74..9e4d118 100755 --- a/scripts/rpi_bidfinder.sh +++ b/scripts/rpi_bidfinder.sh @@ -1,27 +1,82 @@ #!/bin/bash -# rpi_bidfinder.sh - prepare the project on a Raspberry Pi. -# Installs Node.js, fetches dependencies, initialises the database and can -# optionally launch the server. Use the -p or --production flag to skip dev -# dependencies and supply a port number to start the server immediately. -# Usage: ./scripts/rpi_bidfinder.sh [-p|--production] [PORT] +# --------------------------------------------------------------------------- +# @file rpi_bidfinder.sh +# @description Automated setup routine tailored for Raspberry Pi deployments. +# The script installs Node.js, fetches dependencies, initialises the SQLite +# database and optionally launches the Procurement Scraper GUI. Operators can +# skip development dependencies, choose a listening port and extend the +# allowed domain list so new feeds (such as DSTL) can be registered without +# manual environment configuration. +# @usage ./scripts/rpi_bidfinder.sh [-p|--production] [PORT] +# ./scripts/rpi_bidfinder.sh --allow-domain contracts.mod.uk 4000 +# @structure +# 1. Parse command-line arguments for production mode, port and domain +# options. +# 2. Install prerequisites and project dependencies. +# 3. Initialise the database and optionally start the server using run.sh. +# --------------------------------------------------------------------------- set -e -# Parse command line options for production mode and optional port number. The -# port argument can be provided in any position as long as it is not preceded -# by -p or --production. +# Parse command line options for production mode, port and allow-list domains. PROD=0 PORT="" +declare -a RUN_ARGS=() + +usage() { + cat <<'EOF' +Usage: ./scripts/rpi_bidfinder.sh [options] [PORT] + +Options: + -p, --production Install only production dependencies. 
+ --allow-domain Forward hostname to run.sh for allow-listing. + --allow-domains Forward comma-separated hostnames to run.sh. + -h, --help Display this help message and exit. + +Providing a bare number continues to set the server port for backwards +compatibility. Any allow-domain options are passed to run.sh so the Node.js +process receives the expanded ALLOWED_SOURCE_DOMAINS variable. +EOF +} + while [[ $# -gt 0 ]]; do case "$1" in -p|--production) PROD=1 # toggle to install only production dependencies shift ;; + --allow-domain) + if [[ -z "$2" ]]; then + echo "Error: --allow-domain requires a hostname." >&2 + usage + exit 1 + fi + RUN_ARGS+=("--allow-domain" "$2") + shift 2 + ;; + --allow-domains) + if [[ -z "$2" ]]; then + echo "Error: --allow-domains requires a comma-separated list." >&2 + usage + exit 1 + fi + RUN_ARGS+=("--allow-domains" "$2") + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; *) - PORT="$1" # treat any other argument as the desired port - shift + if [[ -z "$PORT" && "$1" =~ ^[0-9]+$ ]]; then + PORT="$1" # treat any other argument as the desired port + shift + else + echo "Error: Unknown argument '$1'." >&2 + usage + exit 1 + fi ;; esac done @@ -53,7 +108,8 @@ npm run init-db # create the SQLite database # exports PORT before starting the Node.js server. The server is started in the # background so this script exits immediately. if [[ -n "$PORT" ]]; then - ./scripts/run.sh "$PORT" + RUN_ARGS+=("--port" "$PORT") + ./scripts/run.sh "${RUN_ARGS[@]}" fi echo "Setup complete." diff --git a/scripts/run.sh b/scripts/run.sh index 0874e57..50881e7 100755 --- a/scripts/run.sh +++ b/scripts/run.sh @@ -1,18 +1,133 @@ #!/bin/bash -# run.sh - convenience wrapper to launch the Procurement Scraper GUI server. -# Usage: ./scripts/run.sh [PORT] -# -# Accepts an optional port argument which is exported as the PORT environment -# variable so the Express application listens on that interface. 
The server is -# started in the background to avoid blocking the calling script. Logs continue -# to be written to logs/app.log by the application's logger. +# --------------------------------------------------------------------------- +# @file run.sh +# @description Utility wrapper that launches the Procurement Scraper GUI +# backend with sensible defaults. The script accepts optional arguments for +# setting the HTTP port and extending the domain allow list used when +# administrators register new feeds via the UI. By exporting the relevant +# environment variables before invoking Node.js, the script keeps runtime +# configuration in one place and avoids manual shell commands. +# @usage ./scripts/run.sh [PORT] [--port PORT] [--allow-domain HOST] +# ./scripts/run.sh --allow-domains host1,host2 +# @structure +# 1. Parse CLI arguments (port plus optional allow-list domains). +# 2. Export environment variables derived from the parsed arguments. +# 3. Launch the Node.js server in the background while printing guidance for +# retrieving logs. +# --------------------------------------------------------------------------- set -e -# Respect the first argument as the desired port if present. -if [ -n "$1" ]; then - export PORT="$1" +# Track requested port and any allow-list domains supplied by the operator. +PORT_ARG="" +declare -a ALLOWED_DOMAINS=() + +usage() { + cat <<'EOF' +Usage: ./scripts/run.sh [PORT] [options] + +Options: + -p, --port Explicitly set the server port. + --allow-domain Append a hostname to ALLOWED_SOURCE_DOMAINS. + --allow-domains Provide a comma-separated list of hostnames. + -h, --help Show this message and exit. + +Supplying a bare number without --port is still supported for backwards +compatibility. Hostnames are normalised to lower case and may be specified as +either raw domains or full HTTPS URLs. +EOF +} + +# Parse arguments, supporting both legacy positional ports and explicit flags. 
+while [[ $# -gt 0 ]]; do + case "$1" in + -p|--port) + if [[ -z "$2" ]]; then + echo "Error: --port requires a numeric argument." >&2 + usage + exit 1 + fi + PORT_ARG="$2" + shift 2 + ;; + --allow-domain) + if [[ -z "$2" ]]; then + echo "Error: --allow-domain requires a hostname." >&2 + usage + exit 1 + fi + ALLOWED_DOMAINS+=("$2") + shift 2 + ;; + --allow-domains) + if [[ -z "$2" ]]; then + echo "Error: --allow-domains requires a comma-separated list." >&2 + usage + exit 1 + fi + IFS=',' read -ra MULTI <<<"$2" + for dom in "${MULTI[@]}"; do + ALLOWED_DOMAINS+=("$dom") + done + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + if [[ -z "$PORT_ARG" && "$1" =~ ^[0-9]+$ ]]; then + PORT_ARG="$1" + shift + else + echo "Error: Unknown argument '$1'." >&2 + usage + exit 1 + fi + ;; + esac +done + +if [[ -n "$PORT_ARG" ]]; then + export PORT="$PORT_ARG" +fi + +# Merge supplied domains with any existing ALLOWED_SOURCE_DOMAINS definition. +if [[ ${#ALLOWED_DOMAINS[@]} -gt 0 ]]; then + IFS=',' read -ra EXISTING <<<"${ALLOWED_SOURCE_DOMAINS:-}" + COMBINED=() + for host in "${EXISTING[@]}"; do + COMBINED+=("$host") + done + for host in "${ALLOWED_DOMAINS[@]}"; do + COMBINED+=("$host") + done + + declare -A SEEN=() + DEDUPED=() + for host in "${COMBINED[@]}"; do + normalised=$(printf '%s' "$host" | tr '[:upper:]' '[:lower:]' | xargs) + if [[ -z "$normalised" ]]; then + continue + fi + if [[ "$normalised" == *"://"* ]]; then + normalised=${normalised#*://} + fi + normalised=${normalised%%/*} + normalised=${normalised%%:*} + + if [[ -n "$normalised" && -z "${SEEN[$normalised]}" ]]; then + SEEN[$normalised]=1 + DEDUPED+=("$normalised") + fi + done + + if [[ ${#DEDUPED[@]} -gt 0 ]]; then + ALLOWED_SOURCE_DOMAINS=$(IFS=','; echo "${DEDUPED[*]}") + export ALLOWED_SOURCE_DOMAINS + echo "Allowing additional source domains: $ALLOWED_SOURCE_DOMAINS" + fi fi # Start the Node.js backend in the background. 
Use nohup so the server keeps diff --git a/server/config.js b/server/config.js index dbc3245..76941f3 100644 --- a/server/config.js +++ b/server/config.js @@ -30,6 +30,66 @@ function parseSelectorList(value, fallback = []) { return [...fallback]; } +/** + * Convert environment input into a deduplicated list of domain names. The + * helper accepts a string, array or undefined value and always returns lower + * case hostnames without protocols, ports or trailing slashes so downstream + * comparisons remain consistent. Full URLs are tolerated for operator + * convenience and are reduced to their hostname component. + * + * @param {string|string[]|undefined} value - Domains supplied via environment + * @param {string[]} fallback - Default domains to include when none provided + * @returns {string[]} normalised hostname list + */ +function parseDomainList(value, fallback = []) { + const result = []; + const seen = new Set(); + + /** + * Normalise and register a hostname candidate if it has not been seen before. + * + * @param {string} candidate - Potential hostname or URL to add + */ + function register(candidate) { + if (!candidate) return; + let host = candidate.trim().toLowerCase(); + if (!host) return; + + if (host.includes('://')) { + try { + host = new URL(host).hostname.toLowerCase(); + } catch { + // Ignore values that are not valid URLs; they will be skipped. + return; + } + } + + host = host.replace(/\/$/, ''); + if (!host) return; + + // Strip any lingering port numbers (e.g. example.com:8080). 
+ const portSeparator = host.indexOf(':'); + if (portSeparator !== -1) { + host = host.slice(0, portSeparator); + } + + if (!seen.has(host)) { + seen.add(host); + result.push(host); + } + } + + fallback.forEach(register); + + if (Array.isArray(value)) { + value.forEach(register); + } else if (typeof value === 'string' && value.trim()) { + value.split(',').forEach(register); + } + + return result; +} + /** * Convert an environment string into a positive integer, falling back to a * default when parsing fails or the provided value is out of range. @@ -96,6 +156,16 @@ const defaultNetwork = { detailConcurrency: parsePositiveInt(process.env.HTTP_DETAIL_CONCURRENCY, 4) }; +// Domains explicitly permitted when administrators add custom sources. The +// default list includes DSTL's contracts portal to prevent validation failures +// when configuring the DSTL feed. Operators can extend the allow list via the +// ALLOWED_SOURCE_DOMAINS environment variable. +const defaultAllowedDomains = ['contracts.mod.uk']; +const allowedSourceDomains = parseDomainList( + process.env.ALLOWED_SOURCE_DOMAINS, + defaultAllowedDomains +); + // Centralised configuration object used throughout the server code. Values can // be overridden via environment variables for flexibility in different // deployment environments. @@ -377,6 +447,11 @@ module.exports = { // timeouts, retry counts and detail page concurrency. network: defaultNetwork, + // Hostnames that are always permitted when defining custom sources. Values + // come from defaults above and can be extended through the + // ALLOWED_SOURCE_DOMAINS environment variable. + allowedSourceDomains, + // Legacy fields maintained for backwards compatibility. These map to the // default source so existing code and tests continue to work. 
scrapeUrl: defaultSource.url, diff --git a/server/index.js b/server/index.js index c5c7a92..58877c5 100644 --- a/server/index.js +++ b/server/index.js @@ -68,15 +68,10 @@ function normaliseParserKey(parserKey) { // statically configured sources but can be extended via the // ALLOWED_SOURCE_DOMAINS environment variable. const allowedDomains = new Set([ - ...Object.values(config.sources).map(s => new URL(s.base).hostname), - ...Object.values(config.awardSources).map(s => new URL(s.base).hostname) + ...Object.values(config.sources).map(s => new URL(s.base).hostname.toLowerCase()), + ...Object.values(config.awardSources).map(s => new URL(s.base).hostname.toLowerCase()), + ...config.allowedSourceDomains ]); -if (process.env.ALLOWED_SOURCE_DOMAINS) { - for (const dom of process.env.ALLOWED_SOURCE_DOMAINS.split(',')) { - const trimmed = dom.trim().toLowerCase(); - if (trimmed) allowedDomains.add(trimmed); - } -} /** * Validate whether a provided URL is permitted based on protocol and hostname.