diff --git a/README.md b/README.md
index d7e15ee..f1c3499 100644
--- a/README.md
+++ b/README.md
@@ -107,6 +107,43 @@ pkill -f "node server/index.js"
accounts.
- `ENABLE_LOG_STREAM` - set to `false` to disable the `/logs` streaming endpoint
in production and avoid exposing real-time log data if not required.
+- `ALLOWED_SOURCE_DOMAINS` - comma-separated list of additional hostnames that
+ administrators are permitted to use when defining custom feeds. The value is
+ merged with the built-in allow list (which already includes
+ `contracts.mod.uk` for the DSTL portal).
+
+### Allowing additional source domains
+
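+If you see a message similar to `Search URL rejected: Host "example.org"
+is not on the allow list` while adding a feed, extend the allow list using the
+`ALLOWED_SOURCE_DOMAINS` environment variable. `contracts.mod.uk` is already
+part of the built-in list, so the commands below use it purely to illustrate
+the syntax; substitute the hostname reported in the error. Example commands: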
+
+- **Linux/macOS/Raspberry Pi**
+ ```bash
+ export ALLOWED_SOURCE_DOMAINS="contracts.mod.uk"
+ ```
+- **Windows PowerShell**
+ ```powershell
+ $env:ALLOWED_SOURCE_DOMAINS="contracts.mod.uk"
+ ```
+
+Restart the server after setting the variable so the new domains are loaded.
+Multiple hostnames can be supplied by separating them with commas, for example
+`contracts.mod.uk,example.org`.
+
+For convenience, the helper script below can export the variable and launch the
+server in one step:
+
+```bash
+./scripts/run.sh --allow-domain contracts.mod.uk
+```
+
+To configure a Raspberry Pi in a single command, the setup script forwards the
+same option to the background server:
+
+```bash
+./scripts/rpi_bidfinder.sh --allow-domain contracts.mod.uk 4000
+```
## Scheduled cron job
diff --git a/frontend/help.ejs b/frontend/help.ejs
index 2f7885a..6627c4d 100644
--- a/frontend/help.ejs
+++ b/frontend/help.ejs
@@ -29,7 +29,7 @@
Parser – choose the parser from the dropdown. The default (<%= defaultParser %>) matches most Contracts Finder feeds.
Award sources follow the same structure but point at feeds of awarded contracts.
- If you see a “URL not permitted” warning, check that both addresses start with https:// and that the hostname appears in the allowed list. Administrators can extend the allow list using the ALLOWED_SOURCE_DOMAINS environment variable (comma-separated hostnames).
+ If you see a “URL not permitted” warning, check that both addresses start with https:// and that the hostname appears in the allowed list. DSTL’s contracts.mod.uk domain is pre-approved, but you can extend the list with the ALLOWED_SOURCE_DOMAINS environment variable or by launching the server through ./scripts/run.sh --allow-domain example.org.
Parser Reference
diff --git a/scripts/rpi_bidfinder.sh b/scripts/rpi_bidfinder.sh
index 6e11b74..9e4d118 100755
--- a/scripts/rpi_bidfinder.sh
+++ b/scripts/rpi_bidfinder.sh
@@ -1,27 +1,82 @@
#!/bin/bash
-# rpi_bidfinder.sh - prepare the project on a Raspberry Pi.
-# Installs Node.js, fetches dependencies, initialises the database and can
-# optionally launch the server. Use the -p or --production flag to skip dev
-# dependencies and supply a port number to start the server immediately.
-# Usage: ./scripts/rpi_bidfinder.sh [-p|--production] [PORT]
+# ---------------------------------------------------------------------------
+# @file rpi_bidfinder.sh
+# @description Automated setup routine tailored for Raspberry Pi deployments.
+# The script installs Node.js, fetches dependencies, initialises the SQLite
+# database and optionally launches the Procurement Scraper GUI. Operators can
+# skip development dependencies, choose a listening port and extend the
+# allowed domain list so new feeds (such as DSTL) can be registered without
+# manual environment configuration.
+# @usage ./scripts/rpi_bidfinder.sh [-p|--production] [PORT]
+# ./scripts/rpi_bidfinder.sh --allow-domain contracts.mod.uk 4000
+# @structure
+# 1. Parse command-line arguments for production mode, port and domain
+# options.
+# 2. Install prerequisites and project dependencies.
+# 3. Initialise the database and optionally start the server using run.sh.
+# ---------------------------------------------------------------------------
set -e
-# Parse command line options for production mode and optional port number. The
-# port argument can be provided in any position as long as it is not preceded
-# by -p or --production.
+# Parse command line options for production mode, port and allow-list domains.
PROD=0
PORT=""
+declare -a RUN_ARGS=()
+
+# Print the supported command-line options to stdout.
+usage() {
+  cat <<'EOF'
+Usage: ./scripts/rpi_bidfinder.sh [options] [PORT]
+
+Options:
+  -p, --production        Install only production dependencies.
+  --allow-domain HOST     Forward hostname to run.sh for allow-listing.
+  --allow-domains LIST    Forward comma-separated hostnames to run.sh.
+  -h, --help              Display this help message and exit.
+
+Providing a bare number continues to set the server port for backwards
+compatibility. Any allow-domain options are passed to run.sh, which is only
+invoked when a PORT is supplied, so the Node.js process receives the expanded
+ALLOWED_SOURCE_DOMAINS variable.
+EOF
+}
+
while [[ $# -gt 0 ]]; do
case "$1" in
-p|--production)
PROD=1 # toggle to install only production dependencies
shift
;;
+    --allow-domain)
+      # A value that is missing or that looks like another flag (for example
+      # `--allow-domain --production`) is a usage error, not a hostname.
+      if [[ -z "$2" || "$2" == -* ]]; then
+        echo "Error: --allow-domain requires a hostname." >&2
+        usage
+        exit 1
+      fi
+      # Collected verbatim; run.sh performs the normalisation.
+      RUN_ARGS+=("--allow-domain" "$2")
+      shift 2
+      ;;
+    --allow-domains)
+      # Same guard as above so a following flag is never swallowed as a list.
+      if [[ -z "$2" || "$2" == -* ]]; then
+        echo "Error: --allow-domains requires a comma-separated list." >&2
+        usage
+        exit 1
+      fi
+      RUN_ARGS+=("--allow-domains" "$2")
+      shift 2
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
*)
- PORT="$1" # treat any other argument as the desired port
- shift
+      # Only the first bare number is accepted as the port; anything else
+      # (including a second number) is reported as an unknown argument.
+      if [[ -z "$PORT" && "$1" =~ ^[0-9]+$ ]]; then
+        PORT="$1"
+        shift
+      else
+        echo "Error: Unknown argument '$1'." >&2
+        usage
+        exit 1
+      fi
;;
esac
done
@@ -53,7 +108,8 @@ npm run init-db # create the SQLite database
# exports PORT before starting the Node.js server. The server is started in the
# background so this script exits immediately.
if [[ -n "$PORT" ]]; then
- ./scripts/run.sh "$PORT"
+  # Hand the port to run.sh together with any collected allow-domain options.
+  RUN_ARGS+=("--port" "$PORT")
+  ./scripts/run.sh "${RUN_ARGS[@]}"
+elif [[ ${#RUN_ARGS[@]} -gt 0 ]]; then
+  # run.sh is only invoked when a port is given, so without one the collected
+  # allow-domain options would otherwise be discarded without any notice.
+  echo "Warning: allow-domain options were ignored because no PORT was supplied, so the server was not started." >&2
fi
echo "Setup complete."
diff --git a/scripts/run.sh b/scripts/run.sh
index 0874e57..50881e7 100755
--- a/scripts/run.sh
+++ b/scripts/run.sh
@@ -1,18 +1,133 @@
#!/bin/bash
-# run.sh - convenience wrapper to launch the Procurement Scraper GUI server.
-# Usage: ./scripts/run.sh [PORT]
-#
-# Accepts an optional port argument which is exported as the PORT environment
-# variable so the Express application listens on that interface. The server is
-# started in the background to avoid blocking the calling script. Logs continue
-# to be written to logs/app.log by the application's logger.
+# ---------------------------------------------------------------------------
+# @file run.sh
+# @description Utility wrapper that launches the Procurement Scraper GUI
+# backend with sensible defaults. The script accepts optional arguments for
+# setting the HTTP port and extending the domain allow list used when
+# administrators register new feeds via the UI. By exporting the relevant
+# environment variables before invoking Node.js, the script keeps runtime
+# configuration in one place and avoids manual shell commands.
+# @usage ./scripts/run.sh [PORT] [--port PORT] [--allow-domain HOST]
+# ./scripts/run.sh --allow-domains host1,host2
+# @structure
+# 1. Parse CLI arguments (port plus optional allow-list domains).
+# 2. Export environment variables derived from the parsed arguments.
+# 3. Launch the Node.js server in the background while printing guidance for
+# retrieving logs.
+# ---------------------------------------------------------------------------
set -e
-# Respect the first argument as the desired port if present.
-if [ -n "$1" ]; then
- export PORT="$1"
+# Track requested port and any allow-list domains supplied by the operator.
+PORT_ARG=""
+declare -a ALLOWED_DOMAINS=()
+
+# Print the supported command-line options to stdout.
+usage() {
+  cat <<'EOF'
+Usage: ./scripts/run.sh [PORT] [options]
+
+Options:
+  -p, --port PORT         Explicitly set the server port.
+  --allow-domain HOST     Append a hostname to ALLOWED_SOURCE_DOMAINS.
+  --allow-domains LIST    Provide a comma-separated list of hostnames.
+  -h, --help              Show this message and exit.
+
+Supplying a bare number without --port is still supported for backwards
+compatibility. Hostnames are normalised to lower case and may be specified as
+either raw domains or full HTTPS URLs.
+EOF
+}
+
+# Parse arguments, supporting both legacy positional ports and explicit flags.
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    -p|--port)
+      # Apply the same digits-only rule as the positional form so that a
+      # missing value or a following flag (`--port --allow-domain ...`) is
+      # rejected instead of being exported as PORT.
+      if [[ ! "$2" =~ ^[0-9]+$ ]]; then
+        echo "Error: --port requires a numeric argument." >&2
+        usage
+        exit 1
+      fi
+      PORT_ARG="$2"
+      shift 2
+      ;;
+    --allow-domain)
+      # A value that is missing or that looks like another flag is a usage
+      # error, not a hostname.
+      if [[ -z "$2" || "$2" == -* ]]; then
+        echo "Error: --allow-domain requires a hostname." >&2
+        usage
+        exit 1
+      fi
+      ALLOWED_DOMAINS+=("$2")
+      shift 2
+      ;;
+    --allow-domains)
+      if [[ -z "$2" || "$2" == -* ]]; then
+        echo "Error: --allow-domains requires a comma-separated list." >&2
+        usage
+        exit 1
+      fi
+      # Split on commas; the IFS override applies to this `read` only.
+      IFS=',' read -ra MULTI <<<"$2"
+      for dom in "${MULTI[@]}"; do
+        ALLOWED_DOMAINS+=("$dom")
+      done
+      shift 2
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      # Legacy form: the first bare number is the port. Anything else is
+      # reported as an unknown argument.
+      if [[ -z "$PORT_ARG" && "$1" =~ ^[0-9]+$ ]]; then
+        PORT_ARG="$1"
+        shift
+      else
+        echo "Error: Unknown argument '$1'." >&2
+        usage
+        exit 1
+      fi
+      ;;
+  esac
+done
+
+# Export PORT only when one was requested so an inherited value is kept.
+if [[ -n "$PORT_ARG" ]]; then
+  export PORT="$PORT_ARG"
+fi
+
+# Merge supplied domains with any existing ALLOWED_SOURCE_DOMAINS definition.
+if [[ ${#ALLOWED_DOMAINS[@]} -gt 0 ]]; then
+  # Split the inherited value on commas; the IFS override applies to this
+  # `read` only.
+  IFS=',' read -ra EXISTING <<<"${ALLOWED_SOURCE_DOMAINS:-}"
+
+  # Inherited hosts are visited first so they keep their position. Duplicates
+  # are detected with a linear scan rather than `declare -A`, because
+  # associative arrays need bash 4+ and macOS still ships bash 3.2.
+  DEDUPED=()
+  for host in "${EXISTING[@]}" "${ALLOWED_DOMAINS[@]}"; do
+    normalised=$(printf '%s' "$host" | tr '[:upper:]' '[:lower:]')
+
+    # Trim surrounding whitespace with parameter expansion; xargs would also
+    # interpret quotes and backslashes and can abort on unbalanced ones.
+    normalised="${normalised#"${normalised%%[![:space:]]*}"}"
+    normalised="${normalised%"${normalised##*[![:space:]]}"}"
+
+    # Reduce full URLs to the bare host: drop scheme, then path, then port.
+    normalised=${normalised#*://}
+    normalised=${normalised%%/*}
+    normalised=${normalised%%:*}
+
+    if [[ -z "$normalised" ]]; then
+      continue
+    fi
+
+    duplicate=0
+    for known in "${DEDUPED[@]}"; do
+      if [[ "$known" == "$normalised" ]]; then
+        duplicate=1
+        break
+      fi
+    done
+    if [[ $duplicate -eq 0 ]]; then
+      DEDUPED+=("$normalised")
+    fi
+  done
+
+  if [[ ${#DEDUPED[@]} -gt 0 ]]; then
+    # Join with commas inside a subshell so the caller's IFS is untouched.
+    ALLOWED_SOURCE_DOMAINS=$(IFS=','; echo "${DEDUPED[*]}")
+    export ALLOWED_SOURCE_DOMAINS
+    echo "Allowing additional source domains: $ALLOWED_SOURCE_DOMAINS"
+  fi
+fi
# Start the Node.js backend in the background. Use nohup so the server keeps
diff --git a/server/config.js b/server/config.js
index dbc3245..76941f3 100644
--- a/server/config.js
+++ b/server/config.js
@@ -30,6 +30,66 @@ function parseSelectorList(value, fallback = []) {
return [...fallback];
}
+/**
+ * Convert environment input into a deduplicated list of hostnames. The helper
+ * accepts a string (comma-separated), an array or undefined, and returns
+ * lower-case hostnames with any scheme, credentials, port, path or query
+ * removed so downstream comparisons against `new URL(...).hostname` remain
+ * consistent. Full URLs are tolerated for operator convenience.
+ *
+ * Entries that are not strings, are blank, or cannot be parsed as a host
+ * (for example ":8080") are skipped rather than registered as empty names.
+ * Fallback entries come first, and the first occurrence of a host wins.
+ *
+ * @param {string|string[]|undefined} value - Domains supplied via environment
+ * @param {string[]} fallback - Default domains that are always included
+ * @returns {string[]} normalised hostname list
+ */
+function parseDomainList(value, fallback = []) {
+  // A Set keeps insertion order, so it deduplicates and preserves ordering.
+  const hosts = new Set();
+
+  /**
+   * Normalise a hostname candidate and record it if it is usable.
+   *
+   * @param {*} candidate - Potential hostname or URL to add
+   */
+  function register(candidate) {
+    if (typeof candidate !== 'string') return;
+    const trimmed = candidate.trim();
+    if (!trimmed) return;
+
+    // Bare hosts get a scheme so the URL parser handles every form. It strips
+    // ports and paths and keeps bracketed IPv6 literals intact, which the
+    // previous first-colon split did not.
+    const asUrl = trimmed.includes('://') ? trimmed : `https://${trimmed}`;
+
+    let host;
+    try {
+      host = new URL(asUrl).hostname.toLowerCase();
+    } catch {
+      // Not parseable as a host; skipping is the documented behaviour.
+      return;
+    }
+
+    if (host) hosts.add(host);
+  }
+
+  fallback.forEach((entry) => register(entry));
+
+  if (Array.isArray(value)) {
+    value.forEach((entry) => register(entry));
+  } else if (typeof value === 'string' && value.trim()) {
+    value.split(',').forEach((entry) => register(entry));
+  }
+
+  return [...hosts];
+}
+
/**
* Convert an environment string into a positive integer, falling back to a
* default when parsing fails or the provided value is out of range.
@@ -96,6 +156,16 @@ const defaultNetwork = {
detailConcurrency: parsePositiveInt(process.env.HTTP_DETAIL_CONCURRENCY, 4)
};
+// Domains explicitly permitted when administrators add custom sources. The
+// default list includes DSTL's contracts portal to prevent validation failures
+// when configuring the DSTL feed. Operators can extend the allow list via the
+// ALLOWED_SOURCE_DOMAINS environment variable.
+const defaultAllowedDomains = ['contracts.mod.uk'];
+const allowedSourceDomains = parseDomainList(
+ process.env.ALLOWED_SOURCE_DOMAINS,
+ defaultAllowedDomains
+);
+
// Centralised configuration object used throughout the server code. Values can
// be overridden via environment variables for flexibility in different
// deployment environments.
@@ -377,6 +447,11 @@ module.exports = {
// timeouts, retry counts and detail page concurrency.
network: defaultNetwork,
+ // Hostnames that are always permitted when defining custom sources. Values
+ // come from defaults above and can be extended through the
+ // ALLOWED_SOURCE_DOMAINS environment variable.
+ allowedSourceDomains,
+
// Legacy fields maintained for backwards compatibility. These map to the
// default source so existing code and tests continue to work.
scrapeUrl: defaultSource.url,
diff --git a/server/index.js b/server/index.js
index c5c7a92..58877c5 100644
--- a/server/index.js
+++ b/server/index.js
@@ -68,15 +68,10 @@ function normaliseParserKey(parserKey) {
// statically configured sources but can be extended via the
// ALLOWED_SOURCE_DOMAINS environment variable.
const allowedDomains = new Set([
- ...Object.values(config.sources).map(s => new URL(s.base).hostname),
- ...Object.values(config.awardSources).map(s => new URL(s.base).hostname)
+ ...Object.values(config.sources).map(s => new URL(s.base).hostname.toLowerCase()),
+ ...Object.values(config.awardSources).map(s => new URL(s.base).hostname.toLowerCase()),
+ ...config.allowedSourceDomains
]);
-if (process.env.ALLOWED_SOURCE_DOMAINS) {
- for (const dom of process.env.ALLOWED_SOURCE_DOMAINS.split(',')) {
- const trimmed = dom.trim().toLowerCase();
- if (trimmed) allowedDomains.add(trimmed);
- }
-}
/**
* Validate whether a provided URL is permitted based on protocol and hostname.