-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathAPI_scraper.py
More file actions
176 lines (168 loc) · 6.29 KB
/
API_scraper.py
File metadata and controls
176 lines (168 loc) · 6.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
import json
import requests
import time
# Base URL of the IUCN Red List API (v3); request paths are appended to it.
api_endpoint = 'http://apiv3.iucnredlist.org/api/v3/'
# Query-string suffix carrying the API token, appended to every request URL.
# NOTE(review): the token is committed in source; consider loading it from the
# environment or an untracked config file instead.
token = '?token=9bb4facb6d23f48efbf424bb05c0c1ef1cf6f468393bc745d42179ac4aca5fee'

# Get all the species: reuse the local cache when it can be read, otherwise
# page through the API and write the cache for the next run.
try:
    with open("species.json", "r") as read_file:
        print("Loading species from file.")
        species = json.load(read_file)
except (OSError, ValueError):
    # OSError: cache missing or unreadable. ValueError: cache is not valid
    # JSON (json.JSONDecodeError is a ValueError subclass). Anything else,
    # e.g. KeyboardInterrupt, must propagate instead of triggering a download.
    print("Retrieving species from API...")
    api_request = 'species/page/'
    pagination = 0
    species = []
    retrieved_all = False
    while not retrieved_all:
        print(f' page {pagination}')
        url = api_endpoint + api_request + str(pagination) + token
        species_response = requests.get(url=url).json()
        species.extend(species_response['result'])
        # A page holding fewer than 10000 entries is treated as the last one.
        if species_response['count'] < 10000:
            retrieved_all = True
            print("All retrieved.")
        pagination += 1
        # sleep to not over query the api and be locked out
        time.sleep(0.5)
    with open('species.json', 'w') as fout:
        json.dump(species, fout, indent=4)
        # Keep the trailing newline the cache file has always ended with.
        fout.write("\n")
    print("Species written to file.")
# Animals: reuse the cached subset when it can be read, otherwise filter the
# full species list down to the ANIMALIA kingdom and cache the result.
try:
    with open("animals.json", "r") as read_file:
        print("Loading animals from file.")
        animals = json.load(read_file)
except (OSError, ValueError):
    # OSError: cache missing or unreadable. ValueError: cache is not valid
    # JSON. Other exceptions (e.g. KeyboardInterrupt) are left to propagate.
    animals = [entry for entry in species if entry['kingdom_name'] == 'ANIMALIA']
    with open('animals.json', 'w') as fout:
        json.dump(animals, fout, indent=4)
        # Keep the trailing newline the cache file has always ended with.
        fout.write("\n")
    print("Animals written to file.")
# (key stored on the animal, API path prefix, label used in error messages)
_DETAIL_ENDPOINTS = (
    ('countries', 'species/countries/id/', 'countries'),
    ('threats', 'threats/species/id/', 'threat'),
    ('habitats', 'habitats/species/id/', 'habitat'),
    ('measures', 'measures/species/id/', 'measures'),
)


def _fetch_json(api_request, label, taxonid):
    """Request one API path and return the decoded JSON body.

    Returns None, after printing the error label, the taxon id and the raw
    response, when the response body cannot be decoded as JSON.
    """
    response = requests.get(url=api_endpoint + api_request + token)
    try:
        return response.json()
    except ValueError:
        # JSON decoding errors are ValueError subclasses; other exceptions
        # (e.g. KeyboardInterrupt) are deliberately not swallowed.
        print(f"Error in {label} retrieval")
        print(taxonid)
        print(response)
        return None


def _enrich_mammal(animal):
    """Merge general, country, threat, habitat and measure data into animal.

    The dict is updated in place. Returns True when every request succeeded
    and False as soon as one fails; data merged before the failure is kept.
    """
    taxonid = animal['taxonid']
    # general species information
    general = _fetch_json('species/id/' + str(taxonid), 'general information', taxonid)
    if general is None:
        return False
    if len(general['result']) < 1:
        print("Error in general information retrieval")
        print(taxonid)
        return False
    animal.update(general['result'][0])
    # countries, threats, habitats, measures
    for key, path, label in _DETAIL_ENDPOINTS:
        details = _fetch_json(path + str(taxonid), label, taxonid)
        if details is None:
            return False
        # The countries result used to be fetched and then discarded; it is
        # now stored like the other detail sections.
        animal[key] = details['result']
    return True


# Mammals: reuse the local cache when it can be read, otherwise fetch the
# mammal list and enrich every entry with per-taxon details.
try:
    with open("mammals.json", "r") as read_file:
        print("Loading mammals from file.")
        mammals = json.load(read_file)
except (OSError, ValueError):
    # OSError: cache missing or unreadable. ValueError: cache is not valid JSON.
    print("Retrieving all mammals...")
    api_request = 'comp-group/getspecies/mammals'
    url = api_endpoint + api_request + token
    mammals_result = requests.get(url=url).json()
    mammals = mammals_result['result']
    print(f"Mammals retrieved, {len(mammals)} in total.")
    print("Retrieving extensive information about the mammals...")
    skipped = []
    for i, animal in enumerate(mammals):
        if not _enrich_mammal(animal):
            # A failed taxon skips the pause and the progress report.
            skipped.append(animal['taxonid'])
            continue
        time.sleep(0.1)
        if (i + 1) % 100 == 0:
            print(f" {i + 1} mammals retrieved, {len(mammals)-(i + 1)} to go")
    with open('mammals.json', 'w') as fout:
        json.dump(mammals, fout, indent=4)
        # Keep the trailing newline the cache file has always ended with.
        fout.write("\n")
    print("Mammals written to file.")
    # Summary: taxon ids that failed, then the number of mammals that reached
    # the final (measures) step. Kept in this branch because `skipped` only
    # exists when the data was retrieved from the API.
    print(skipped)
    print(len([m for m in mammals if 'measures' in m]))
# for every animal in the api, retrieve all the different types of information
# for animal in animals:
# taxonid = animal['taxonid']
# # general species information
# api_request = 'species/id/' + str(taxonid)
# url = api_endpoint + api_request + token
# animal_result = requests.get(url=url).json()
# animal.update(animal_result['result'][0])
# # threats
# api_request = 'threats/species/id/' + str(taxonid)
# url = api_endpoint + api_request + token
# animal_result = requests.get(url=url).json()
# animal['threats'] = animal_result['result']
# # habitats
# api_request = 'habitats/species/id/' + str(taxonid)
# url = api_endpoint + api_request + token
# animal_result = requests.get(url=url).json()
# animal['habitats'] = animal_result['result']
# # measures
# api_request = 'measures/species/id/' + str(taxonid)
# url = api_endpoint + api_request + token
# animal_result = requests.get(url=url).json()
# animal['measures'] = animal_result['result']
# with open('animals-full.json', 'w') as fout:
# json.dump(animals, fout)
# print("Animals written to file.")
# # convert the knowledge into triples
# for animal in knowledge:
# speciesKnowledge = knowledge[animal]
# triples = [(animal, key, value) for key, value in speciesKnowledge.items()]
# # write triples to file, append!
# for s,p,o in triples:
# # write s,p,o to file
# after file writing, convert to RDF with COW