diff --git a/Gemfile b/Gemfile index a50a09c..2154bee 100644 --- a/Gemfile +++ b/Gemfile @@ -1,11 +1,10 @@ source 'https://rubygems.org' -ruby "2.4.0" +ruby "2.6.10" gem 'json' -gem 'elasticsearch' +gem 'elasticsearch', '~> 8.2' gem 'rest-client' gem 'dotenv' -gem 'openactive', :git => 'git://github.com/theodi/openactive.rb' - +gem 'openactive', :git => 'https://github.com/theodi/openactive.rb' diff --git a/Gemfile.lock b/Gemfile.lock index 11233ef..cbc659d 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,5 +1,5 @@ GIT - remote: git://github.com/theodi/openactive.rb + remote: https://github.com/theodi/openactive.rb revision: 17fb014a4aca6e0cf003e9da17fa8e4ef9579cac specs: openactive (0.0.1) @@ -50,7 +50,7 @@ DEPENDENCIES rest-client RUBY VERSION - ruby 2.4.0p0 + ruby 2.6.10p210 BUNDLED WITH 1.14.6 diff --git a/README.md b/README.md index ae0d333..8a7d4d9 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ This repository contains a simple demonstration of harvesting and indexing [opportunity data](http://status.openactive.io/), published as part of the [OpenActive](https://openactive.io) initiative. The example uses a set of simple Ruby scripts to drive harvesting of live data feeds and indexes the data in -[ElasticSearch](https://www.elastic.co/) which is an open source search engine. +[ElasticSearch](https://www.elastic.co/), an open source search tool. The code in this project is published under an open licence and you are free to adapt and reuse it as you see fit. 
@@ -14,15 +14,24 @@ setup has been successfully used to do some simple analysing and reporting on pu You will need to install: -* Ruby -- this was built and tested using Ruby 2.4.0 but should work on later rubies) +* Ruby -- this was built with Ruby 2.4.0 and updated recently with the nearest stable version Ruby 2.6.10 * Bundler -- to install other ruby dependencies * Java 1.8+ -- ElasticSearch is a java application +You can use a software version management tool like chruby to install different versions of ruby and switch between them as needed. If chruby is installed, you can run the following to switch ruby versions and confirm you are using 2.6.10: + +``` +chruby 2.6.10 +ruby -v +``` + ## Grab this code -Clone this repo to your machine, then install the ruby dependencies: +To clone this repo to your machine and install the ruby dependencies, run the following lines in a terminal: ``` +git clone https://github.com/howaskew/openactive-es-example.git +cd openactive-es-example bundle install ``` @@ -31,8 +40,7 @@ bundle install Go to [the Elastic Search download page](https://www.elastic.co/downloads/elasticsearch) and download the zip file of the latest release. -You will need to unzip the file into the `server` sub-directory of this project. You should ignore the -main sub-directory in the zip file, you just need to extract the main project folders. +Unzip this file and copy the contents into the `server` sub-directory of the openactive-es-example directory. You should end up with a `server` directory that looks something like: @@ -41,7 +49,8 @@ server/bin server/config server/lib server/modules -...etc +server/logs +server/plugins ``` ## Update the list of datasets @@ -61,38 +70,70 @@ rake prepare:config This downloads the current list of published datasets and stores it in `config/datasets.json`. By default indexing is disabled for all datasets, you will need to edit `config/datasets.json` to switch on -whichever datasets you want to try indexing. 
We suggest trying using the Leisure World Colchester data as its a small +whichever datasets you want to try indexing. We suggest trying the British Triathlon data as it's a relatively small feed. So edit the following section in `config/datasets.json` so that the `index` key is `true` rather than `false`. ``` - "leisureworldcolchester.github.io": { - "title": "Leisure World Colchester Sessions", - "data_url": "https://lw-colchester-openactive.herokuapp.com", + "britishtriathlon.github.io": { + "title": "British Triathlon Events", + "data_url": "https://api.britishtriathlon.org/openactive/v1/events", "index": true } ``` ## Start ElasticSearch -We then need to startup ElasticSearch so we can configure some indexes to hold the data. In a separate terminal -window, from the project directory you can start ElasticSearch by running the following command: +We then need to start up ElasticSearch so we can configure some indexes to hold the data. + + +**Open a separate terminal window**, cd to the project directory and start ElasticSearch by running the following command: ``` rake es:start ``` -This just runs `./server/bin/elasticsearch` so you can run that directly if you prefer. +(This just runs `./server/bin/elasticsearch` so you can run that directly if you prefer.) + +Make a note of the password for the 'elastic' user and the HTTP CA certificate SHA-256 fingerprint in the output. You may have to scroll back up to see it. + +Edit the file `bin/connect.rb`. Paste in the password and certificate details from the console output and save this file. + +You can test the elasticsearch instance is running by visiting `https://localhost:9200/`. + +You may see a certificate error alert, depending upon your browser, which you can bypass e.g. click Advanced, then select the option to Proceed. + +Log in with the user 'elastic' and the password noted earlier. -You can test its running by visiting `http://localhost:9200/`. 
You should see a JSON response from your local -ElasticSearch server. +If the instance is running, you should see a JSON response from your local ElasticSearch server, something like: + +``` +{ + "name" : "MacBook-Pro.local", + "cluster_name" : "elasticsearch", + "cluster_uuid" : "XZB1wYYJTX3KNltDMVA", + "version" : { + "number" : "8.2.0", + "build_flavor" : "default", + "build_type" : "tar", + "build_hash" : "b174af62e8Sf4aDDc4d25gf875e93812b9282c5", + "build_date" : "2022-04-20T10:35:10.180408517Z", + "build_snapshot" : false, + "lucene_version" : "9.1.0", + "minimum_wire_compatibility_version" : "7.17.0", + "minimum_index_compatibility_version" : "7.0.0" + }, + "tagline" : "You Know, for Search" +} +``` -You can Ctrl-C to shutdown the server at any time. But it needs to be running for the following steps. -ElasticSearch is configured via its API so you need to have an instance available. +You can Ctrl-C to shut down the server at any time - but it needs to be running for the following steps. + +ElasticSearch is configured via its API so you need to have a running instance available. ### Aside: Using a different ElasticSearch Server -The scripts all assume that they are working with an ElasticSearch instance available at `http://localhost:9200/`. +The scripts all assume that they are working with an ElasticSearch instance available at `https://localhost:9200/`. If you want to use an alternative server, then for the moment you'll need to edit the scripts in the `bin` directory to revise the following lines: @@ -104,7 +145,7 @@ See the [elasticsearch-ruby configuration](http://www.rubydoc.info/gems/elastics ## Create Elastic Search Indexes -Run the following to create the ElasticSearch indexes: +**In the original terminal window**, run the following to create the ElasticSearch indexes: ``` rake es:indexes @@ -188,45 +229,28 @@ We can now check to see that we have indexed some data. 
If you visit this URL: ``` -http://localhost:9200/_stats +https://localhost:9200/_stats ``` Then ElasticSearch will dump the current state of all indexes, including how many documents are in each. You can also -ask for index specific statistics. So, assuming you have indexes Leisure World Colchester, then if you visit this URL: +ask for index specific statistics. So, assuming you have indexed British Triathlon, then if you visit this URL: ``` -http://localhost:9200/oa-leisureworldcolchester.github.io/_stats +https://localhost:9200/oa-britishtriathlon.github.io/_stats ``` Then you should get stats for just that index. You should see something like this: ``` -{ - "_shards": { - ... - }, - "_all": { - ... - }, - "indices": { - "oa-leisureworldcolchester.github.io": { - "primaries": { - "docs": { - "count": 156, - "deleted": 0 - }, - ... - } - } -} +{"_shards":{"total":2,"successful":1,"failed":0},"_all":{"primaries":{"docs":{"count":1825,"deleted":0}... ``` -Which tells you that 156 records have been indexed. +Which tells you that 1825 records have been indexed. 
You can also visit the `_search` endpoint to see all the docs: ``` -http://localhost:9200/oa-leisureworldcolchester.github.io/_search +https://localhost:9200/oa-britishtriathlon.github.io/_search ``` We suggest reading [the ElasticSearch documentation on their search API](https://www.elastic.co/guide/en/elasticsearch/reference/current/_exploring_your_data.html) diff --git a/bin/connect.rb b/bin/connect.rb new file mode 100644 index 0000000..0f6cf89 --- /dev/null +++ b/bin/connect.rb @@ -0,0 +1,6 @@ +# Colons and uppercase/lowercase don't matter when using +# the 'ca_fingerprint' parameter +CERT_FINGERPRINT = "XXXXXXXXYYYYYYXXXXXXXYYYYYYYYYXXXXXXXXYYYYYYYYYXXXXXXXXXXXX" + +# Password for the 'elastic' user generated by Elasticsearch +ELASTIC_PASSWORD = "XXXXXXXXXXXXXXXXXXXXXX" diff --git a/bin/create-indexes.rb b/bin/create-indexes.rb index 9ee5f2f..ecd97ee 100644 --- a/bin/create-indexes.rb +++ b/bin/create-indexes.rb @@ -4,7 +4,13 @@ INDEX_PREFIX="oa" -client = Elasticsearch::Client.new +require_relative 'connect' + +client = Elasticsearch::Client.new( + host: "https://elastic:#{ELASTIC_PASSWORD}@localhost:9200", + transport_options: { ssl: { verify: false } }, + ca_fingerprint: CERT_FINGERPRINT +) #Create index template, new templates will follow this pattern template = JSON.parse( File.read( ARGV[1] ) ) diff --git a/bin/delete-index.rb b/bin/delete-index.rb index 901c59d..6643a70 100644 --- a/bin/delete-index.rb +++ b/bin/delete-index.rb @@ -4,7 +4,13 @@ INDEX_PREFIX="oa" -client = Elasticsearch::Client.new +require_relative 'connect' + +client = Elasticsearch::Client.new( + host: "https://elastic:#{ELASTIC_PASSWORD}@localhost:9200", + transport_options: { ssl: { verify: false } }, + ca_fingerprint: CERT_FINGERPRINT +) index_name = "#{INDEX_PREFIX}-#{ARGV[0]}" diff --git a/bin/delete-indexes.rb b/bin/delete-indexes.rb index 9d0db1a..cb33cbb 100644 --- a/bin/delete-indexes.rb +++ b/bin/delete-indexes.rb @@ -4,8 +4,13 @@ INDEX_PREFIX="oa" -client = 
Elasticsearch::Client.new +require_relative 'connect' +client = Elasticsearch::Client.new( + host: "https://elastic:#{ELASTIC_PASSWORD}@localhost:9200", + transport_options: { ssl: { verify: false } }, + ca_fingerprint: CERT_FINGERPRINT +) datasets = JSON.parse( File.read(ARGV[0]) ) datasets.keys.each do |dataset| diff --git a/bin/harvest.rb b/bin/harvest.rb index b28b22c..0114269 100644 --- a/bin/harvest.rb +++ b/bin/harvest.rb @@ -2,7 +2,13 @@ require 'bundler' Bundler.require :default -client = Elasticsearch::Client.new +require_relative 'connect' + +client = Elasticsearch::Client.new( + host: "https://elastic:#{ELASTIC_PASSWORD}@localhost:9200", + transport_options: { ssl: { verify: false } }, + ca_fingerprint: CERT_FINGERPRINT +) #Parse the config/datasets.json file datasets = JSON.parse( File.read(ARGV[0]) ) @@ -37,7 +43,7 @@ #add each item to the update, using the appropriate index #we use the unique id for the item in the feed as the document id #TODO: we should be checking the state of the item and processing deletes here - body << { index: { _index: "#{INDEX_PREFIX}-#{id}", _type: "opp", _id: item["id"] } } + body << { index: { _index: "#{INDEX_PREFIX}-#{id}", _id: item["id"] } } #here we just add the data about the item that was included in the feed #in a production application you will probably want to process this data to ensure that diff --git a/config/index-template.json b/config/index-template.json index d75c920..7be5e02 100644 --- a/config/index-template.json +++ b/config/index-template.json @@ -1,9 +1,12 @@ { - "template": "oa-*", - "mappings": { - "opp": { - "properties": { - } - } + "index_patterns": [ + "oa-*" + ], + "mappings": { + "properties": { + "name" : { + "type" : "text" + } } -} + } + }