From cb19b7686dbb05e3b73b4e56dc5d129a18002f92 Mon Sep 17 00:00:00 2001
From: James
Date: Mon, 4 May 2020 13:48:28 +0100
Subject: [PATCH] SPIDERING WIP

---
 package.json         |   1 +
 src/bin/spidering.js |   5 ++
 src/lib/spidering.js | 108 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 114 insertions(+)
 mode change 100644 => 100755 package.json
 create mode 100644 src/bin/spidering.js
 create mode 100644 src/lib/spidering.js

diff --git a/package.json b/package.json
old mode 100644
new mode 100755
index b6f271a..098d6de
--- a/package.json
+++ b/package.json
@@ -20,6 +20,7 @@
     "@elastic/elasticsearch": "^7.6.0",
     "babel-core": "^6.26.3",
     "babel-preset-env": "^1.7.0",
+    "cheerio": "^1.0.0-rc.3",
     "genversion": "^2.2.0",
     "node-fetch": "^2.6.0",
     "object-hash": "^2.0.3"
diff --git a/src/bin/spidering.js b/src/bin/spidering.js
new file mode 100644
index 0000000..92c2fc9
--- /dev/null
+++ b/src/bin/spidering.js
@@ -0,0 +1,5 @@
+#!/usr/bin/env node
+import spider from '../lib/spidering.js';
+
+spider("https://www.openactive.io/data-catalogs/data-catalog-collection.jsonld");
+
diff --git a/src/lib/spidering.js b/src/lib/spidering.js
new file mode 100644
index 0000000..3feb92c
--- /dev/null
+++ b/src/lib/spidering.js
@@ -0,0 +1,108 @@
+import fetch from 'node-fetch';
+import cheerio from 'cheerio';
+
+// Fetch `url`, rejecting on any non-2xx response.
+async function fetch_ok(url) {
+  const res = await fetch(url);
+  if (!res.ok) {
+    throw new Error(`${res.status} - ${res.statusText}`);
+  }
+  return res;
+}
+
+// Log a failure together with the crawl path that led to `url`.
+function log_error(source, url, url_history, error) {
+  console.error(`ERROR ${source}`);
+  console.error(url_history);
+  console.error(url);
+  console.error(error);
+}
+
+/**
+ * Crawl a data catalog and everything reachable from it.
+ *
+ * @param {string} start_url - URL of the root data catalog.
+ * @returns {Promise<void>} Resolves once the whole crawl has finished.
+ */
+async function spider(start_url) {
+  // Awaited so callers can tell when the crawl is actually done.
+  await spider_data_catalog(start_url, []);
+}
+
+/**
+ * Fetch one data catalog, then visit its `hasPart` catalogs and `dataset`
+ * pages. Failures are logged, never thrown, so one bad URL does not stop
+ * the rest of the crawl.
+ *
+ * @param {string} url - URL of the catalog (JSON) to fetch.
+ * @param {string[]} url_history - Catalog URLs followed to reach `url`.
+ * @returns {Promise<void>}
+ */
+async function spider_data_catalog(url, url_history) {
+  try {
+    // A catalog linking back to one of its ancestors would recurse forever.
+    if (url_history.includes(url)) {
+      throw new Error('Circular data catalog reference');
+    }
+
+    const res = await fetch_ok(url);
+    const json = await res.json();
+    const new_url_history = [...url_history, url];
+    const parts = Array.isArray(json.hasPart) ? json.hasPart : [];
+    const data_sets = Array.isArray(json.dataset) ? json.dataset : [];
+
+    // TODO Without these awaits, loading actual data sets starts failing at
+    // random: 403 on legendonlineservices.co.uk. Rate limited?
+    for (const part_url of parts) {
+      await spider_data_catalog(part_url, new_url_history);
+    }
+    for (const data_set_url of data_sets) {
+      await spider_data_set(data_set_url, new_url_history);
+    }
+  } catch (error) {
+    log_error('spider_data_catalog', url, url_history, error);
+  }
+}
+
+/**
+ * Fetch a data set page, parse its embedded JSON-LD and log a summary of
+ * the data set. Failures are logged, never thrown.
+ *
+ * @param {string} url - URL of the data set's HTML page.
+ * @param {string[]} url_history - Catalog URLs followed to reach `url`.
+ * @returns {Promise<void>}
+ */
+async function spider_data_set(url, url_history) {
+  try {
+    const res = await fetch_ok(url);
+    const $ = cheerio.load(await res.text());
+    const json_ld = $('script[type="application/ld+json"]').html();
+    if (!json_ld) {
+      throw new Error('No application/ld+json script found');
+    }
+    const json = JSON.parse(json_ld);
+
+    const out = {
+      'url': json['url'],
+      'name': json['name'],
+      'data-urls': {},
+    };
+    if (Array.isArray(json.distribution)) {
+      for (const distribution of json.distribution) {
+        out['data-urls'][distribution.name] = distribution.contentUrl;
+      }
+    }
+
+    console.log(out);
+  } catch (error) {
+    log_error('spider_data_set', url, url_history, error);
+  }
+}
+
+export {
+  spider,
+  spider_data_catalog,
+  spider_data_set,
+};
+
+export default spider;
\ No newline at end of file