# krw/API_scraper.py at master · MwjEnde/krw · GitHub
import json
import os
import time

import requests

# Base URL of the IUCN Red List API (v3); request paths are appended to it.
api_endpoint = 'http://apiv3.iucnredlist.org/api/v3/'
# Query-string suffix carrying the API token; appended to every request URL.
# The token can be overridden with the IUCN_API_TOKEN environment variable so
# the credential does not have to live in source control. The embedded value is
# kept only as a backward-compatible default.
# NOTE(review): the default token is committed in plain text - consider
# rotating it and dropping the fallback.
token = '?token=' + (
    os.environ.get('IUCN_API_TOKEN')
    or '9bb4facb6d23f48efbf424bb05c0c1ef1cf6f468393bc745d42179ac4aca5fee'
)

# Get all the species: reuse the local cache file when it can be read,
# otherwise page through the API and write the cache for the next run.
try:
    with open("species.json", "r") as read_file:
        print("Loading species from file.")
        species = json.load(read_file)
except (OSError, ValueError):
    # OSError: the cache is missing or unreadable. ValueError: the cache is not
    # valid JSON (json.JSONDecodeError is a ValueError subclass). Anything else,
    # e.g. KeyboardInterrupt, is deliberately left to propagate.
    print("Retrieving species from API...")
    api_request = 'species/page/'
    pagination = 0
    species = []
    while True:
        print(f'    page {pagination}')
        url = api_endpoint + api_request + str(pagination) + token
        # A timeout keeps a stalled connection from hanging the script forever.
        response = requests.get(url=url, timeout=60)
        # Fail loudly on an HTTP error instead of trying to parse its body.
        response.raise_for_status()
        species_response = response.json()
        species.extend(species_response['result'])
        # A page reporting fewer than 10000 records is treated as the last one.
        if species_response['count'] < 10000:
            print("All retrieved.")
            break
        pagination += 1
        # sleep to not over query the api and be locked out
        time.sleep(0.5)
    with open('species.json', 'w') as fout:
        json.dump(species, fout, indent=4)
        fout.write("\n")
        print("Species written to file.")

# Animals are the subset of species in the ANIMALIA kingdom. Reuse the cached
# file when it can be read; otherwise derive the list from `species` and cache it.
try:
    with open("animals.json", "r") as read_file:
        print("Loading animals from file.")
        animals = json.load(read_file)
except (OSError, ValueError):
    # Only a missing/unreadable file (OSError) or malformed JSON (ValueError)
    # triggers the rebuild; other exceptions are left to propagate.
    animals = [
        specie for specie in species
        if specie['kingdom_name'] == 'ANIMALIA'
    ]
    with open('animals.json', 'w') as fout:
        json.dump(animals, fout, indent=4)
        fout.write("\n")
        print("Animals written to file.")

def _fetch_json(api_request, label, taxonid):
    """Request ``api_request`` from the API and return the decoded JSON body.

    Args:
        api_request: Path appended to ``api_endpoint``; the token suffix is
            added here.
        label: Short name of the data being fetched, used in the error message.
        taxonid: Identifier of the species the request is for; echoed on
            failure.

    Returns:
        The parsed JSON payload, or ``None`` when the response body is not
        valid JSON. In that case the error, the taxon id and the response
        object are printed.
    """
    # A timeout keeps a stalled connection from hanging the long-running loop.
    response = requests.get(url=api_endpoint + api_request + token, timeout=60)
    try:
        return response.json()
    except ValueError:
        # requests signals an undecodable body with a ValueError subclass.
        print(f"Error in {label} retrieval")
        print(taxonid)
        print(response)
        return None


# Per-species detail requests: (request path prefix, key stored on the animal,
# label used in the error message).
_DETAIL_ENDPOINTS = (
    ('species/countries/id/', 'countries', 'countries'),
    ('threats/species/id/', 'threats', 'threat'),
    ('habitats/species/id/', 'habitats', 'habitat'),
    ('measures/species/id/', 'measures', 'measures'),
)

# Reuse the cached mammals file when it can be read; otherwise download the
# mammal list, enrich every entry with the detail endpoints, and cache it.
try:
    with open("mammals.json", "r") as read_file:
        print("Loading mammals from file.")
        mammals = json.load(read_file)
except (OSError, ValueError):
    # Only a missing/unreadable file (OSError) or malformed JSON (ValueError)
    # triggers the download; other exceptions are left to propagate.
    print("Retrieving all mammals...")
    api_request = 'comp-group/getspecies/mammals'
    url = api_endpoint + api_request + token
    mammals_result = requests.get(url=url, timeout=60).json()
    mammals = mammals_result['result']
    print(f"Mammals retrieved, {len(mammals)} in total.")
    print("Retrieving extensive information about the mammals...")
    skipped = []  # taxon ids for which at least one request failed
    for i, animal in enumerate(mammals):
        taxonid = animal['taxonid']
        # general species information
        general = _fetch_json(
            'species/id/' + str(taxonid), 'general information', taxonid
        )
        if general is None:
            skipped.append(taxonid)
            continue
        if not general['result']:
            print("Error in general information retrieval")
            print(taxonid)
            skipped.append(taxonid)
            continue
        animal.update(general['result'][0])
        # countries, threats, habitats and measures; the countries result used
        # to be fetched and then thrown away - it is now stored as well.
        failed = False
        for prefix, key, label in _DETAIL_ENDPOINTS:
            payload = _fetch_json(prefix + str(taxonid), label, taxonid)
            if payload is None:
                failed = True
                break
            animal[key] = payload['result']
        if failed:
            # Data gathered before the failing request stays on the animal.
            skipped.append(taxonid)
            continue
        # short pause between species so the API is not flooded
        time.sleep(0.1)
        if (i + 1) % 100 == 0:
            print(f"   {i + 1} mammals retrieved, {len(mammals)-(i + 1)} to go")
    with open('mammals.json', 'w') as fout:
        json.dump(mammals, fout, indent=4)
        fout.write("\n")
        print("Mammals written to file.")

    print(skipped)
# Number of mammals for which the full set of details was retrieved
# ('measures' is the last key to be filled in).
print(sum(1 for m in mammals if 'measures' in m))

# for every animal in the api, retrieve all the different types of information
# for animal in animals:
#     taxonid = animal['taxonid']
#     # general species information
#     api_request = 'species/id/' + str(taxonid)
#     url = api_endpoint + api_request + token
#     animal_result = requests.get(url=url).json()
#     animal.update(animal_result['result'][0])
#     # threats
#     api_request = 'threats/species/id/' + str(taxonid)
#     url = api_endpoint + api_request + token
#     animal_result = requests.get(url=url).json()
#     animal['threats'] = animal_result['result']
#     # habitats
#     api_request = 'habitats/species/id/' + str(taxonid)
#     url = api_endpoint + api_request + token
#     animal_result = requests.get(url=url).json()
#     animal['habitats'] = animal_result['result']
#     # measures
#     api_request = 'measures/species/id/' + str(taxonid)
#     url = api_endpoint + api_request + token
#     animal_result = requests.get(url=url).json()
#     animal['measures'] = animal_result['result']
# with open('animals-full.json', 'w') as fout:
#     json.dump(animals, fout)
#     print("Animals written to file.")

# # convert the knowledge into triples
# for animal in knowledge:
#     speciesKnowledge = knowledge[animal]
#     triples = [animal,key,value for key,value in speciesKnowledge.items()]
#     # write triples to file, append!
#     for s,p,o in triples:
#         # write s,p,o to file

# after file writing, convert to RDF with COW