Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions package.json
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
"@elastic/elasticsearch": "^7.6.0",
"babel-core": "^6.26.3",
"babel-preset-env": "^1.7.0",
"cheerio": "^1.0.0-rc.3",
"genversion": "^2.2.0",
"node-fetch": "^2.6.0",
"object-hash": "^2.0.3"
Expand Down
5 changes: 5 additions & 0 deletions src/bin/spidering.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/usr/bin/env node
import spider from '../lib/spidering.js';

// Root document the crawl starts from (a collection of data catalogs).
const start_url = "https://www.openactive.io/data-catalogs/data-catalog-collection.jsonld";

// Start the crawl; the promise returned by spider() is not awaited here.
spider(start_url);

87 changes: 87 additions & 0 deletions src/lib/spidering.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import fetch from 'node-fetch';
import cheerio from 'cheerio';


/**
 * Crawl the tree of data catalogs rooted at `start_url`.
 *
 * Delegates to `spider_data_catalog` with an empty URL history.
 *
 * @param {string} start_url - URL of the root data catalog document.
 * @returns {Promise<void>} Settles once the crawl started from `start_url` has finished.
 */
async function spider(start_url) {
  // Await the crawl instead of dropping its promise: otherwise spider() resolves
  // immediately and callers cannot tell when the crawl has actually completed.
  await spider_data_catalog(start_url, []);
}


/**
 * Fetch one data catalog document and crawl everything it links to.
 *
 * Each entry of the document's `hasPart` array is passed back to this function
 * as the URL of a nested catalog, and each entry of its `dataset` array is
 * passed to `spider_data_set`. Any failure is logged to stderr together with
 * the path that led here; nothing is rethrown.
 *
 * @param {string} url - URL of the catalog document (expected to be JSON).
 * @param {string[]} url_history - URLs of the catalogs visited on the way to `url`.
 *   This array is not modified; children receive a copy extended with `url`.
 * @returns {Promise<void>} Settles when this catalog and all its children are done.
 */
async function spider_data_catalog(url, url_history) {
  try {
    // A catalog that lists one of its own ancestors would otherwise recurse forever.
    if (url_history.includes(url)) {
      throw new Error("Circular reference - catalog was already visited on this path");
    }

    const res = await fetch(url);
    if (!res.ok) {
      throw new Error(res.status + " - " + res.statusText);
    }

    const json = await res.json();
    if (typeof json !== 'object' || json === null) {
      throw new Error("Catalog document is not a JSON object");
    }

    const new_url_history = [...url_history, url];

    // The requests below are deliberately sequential: issuing them concurrently
    // produced intermittent failures when loading data sets (e.g. 403 responses
    // from legendonlineservices.co.uk), possibly due to rate limiting.
    if (Array.isArray(json['hasPart'])) {
      for (const part_url of json['hasPart']) {
        await spider_data_catalog(part_url, new_url_history);
      }
    }

    if (Array.isArray(json['dataset'])) {
      for (const data_set_url of json['dataset']) {
        await spider_data_set(data_set_url, new_url_history);
      }
    }
  } catch (error) {
    console.error("ERROR spider_data_catalog");
    console.error(url_history);
    console.error(url);
    console.error(error);
  }
}

/**
 * Fetch a data set page, read its embedded JSON-LD and print a summary.
 *
 * The page is loaded as HTML and the content of the first
 * `<script type="application/ld+json">` element is parsed as JSON. The summary
 * printed to stdout holds the document's `url` and `name`, plus a `data-urls`
 * map from each `distribution` entry's `name` to its `contentUrl`. Any failure
 * is logged to stderr together with the path that led here; nothing is rethrown.
 *
 * @param {string} url - URL of the data set page (HTML).
 * @param {string[]} url_history - URLs of the catalogs visited on the way to `url`;
 *   only used for error reporting.
 * @returns {Promise<void>} Settles once the page has been processed.
 */
async function spider_data_set(url, url_history) {
  try {
    const res = await fetch(url);
    if (!res.ok) {
      throw new Error(res.status + " - " + res.statusText);
    }

    const text = await res.text();
    const $ = cheerio.load(text);
    const json_ld = $('script[type="application/ld+json"]').html();
    // Without this check JSON.parse(null) yields null and the failure shows up
    // later as an unrelated "cannot read property of null" TypeError.
    if (!json_ld) {
      throw new Error("No JSON-LD script block found in page");
    }

    const json = JSON.parse(json_ld);
    if (typeof json !== 'object' || json === null) {
      throw new Error("JSON-LD content is not a JSON object");
    }

    const out = {
      'url': json['url'],
      'name': json['name'],
      'data-urls': {}
    };

    if (Array.isArray(json['distribution'])) {
      for (const distribution of json['distribution']) {
        out['data-urls'][distribution['name']] = distribution['contentUrl'];
      }
    }

    console.log(out);
  } catch (error) {
    console.error("ERROR spider_data_set");
    console.error(url_history);
    console.error(url);
    console.error(error);
  }
}

export {
spider,
spider_data_catalog,
spider_data_set,
};

export default spider;