-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathgeocoder.py
More file actions
419 lines (338 loc) · 13.3 KB
/
geocoder.py
File metadata and controls
419 lines (338 loc) · 13.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
import requests
import re
import psycopg2
from trans_util import bd09_to_wgs84
from ..database.news_server_mongo import MongoDBPipeline
from setting import POSTGRESQL_CON
# Load every document of the `region` collection once, at import time, and
# split the documents by their `level` field.
mongo = MongoDBPipeline()
region = mongo.db['region']
regionObjectList = list(region.find({}))
provinceObjectList = [o for o in regionObjectList if o['level'] == 'province']
cityObjectList = [o for o in regionObjectList if o['level'] == 'city']
districtObjectList = [o for o in regionObjectList if o['level'] == 'district']
# Administrative suffixes stripped from the end of a region name to obtain
# its short form (province, city, county, district, autonomous prefecture,
# autonomous county, autonomous region, autonomous banner, forestry district).
subPattern = r'((省)|(市)|(县)|(区)|(自治州)|(自治县)|(自治区)|(自治旗)|(林区))$'
# Ethnic-group names removed from a shortened region name.
# The '壮族' entry used to carry a leading space (' 壮族'), so it could never
# match inside a region name; the stray space has been removed.
raceList = ['满族', '回族', '达斡尔族', '蒙古族', '朝鲜族', '土家族', '苗族', '瑶族', '壮族', '黎族']
def region_object_in_content(content, regionObject):
    '''
    Tell whether the region described by ``regionObject`` occurs in ``content``.

    The lookup is tried with progressively shorter forms of
    ``regionObject['name']``:

    1. the full name;
    2. for names of three or more characters, the name with its trailing
       administrative suffix (``subPattern``) removed;
    3. that short name with every entry of ``raceList`` removed as well.

    An empty name (at any stage) never counts as a match.

    :param content: text to search in.
    :param regionObject: mapping with at least a ``'name'`` key.
    :return: ``True`` if any form of the name is a substring of ``content``.
    '''
    fullName = regionObject['name']
    if fullName == '':
        return False
    if fullName in content:
        return True
    # Short forms are only attempted for names of three or more characters.
    if len(fullName) < 3:
        return False

    shortName = re.sub(subPattern, '', fullName)
    if shortName == '':
        return False
    if shortName in content:
        return True

    # Last attempt: strip the ethnic-group names too.
    for race in raceList:
        shortName = shortName.replace(race, '')
    # The explicit emptiness test matters: '' is a substring of every string.
    return shortName != '' and shortName in content
def region_name_like_name(name, region):
    '''
    Tell whether ``name`` denotes the region called ``region``.

    The two match when they are equal, or when ``region`` (three or more
    characters) equals ``name`` (two or more characters) once its trailing
    administrative suffix (``subPattern``) is removed.

    :param name: candidate place name.
    :param region: region name to compare against.
    :return: ``True`` on a match, ``False`` otherwise (including when either
        argument is the empty string).
    '''
    if not (name != '' and region != ''):
        return False
    if name == region:
        return True
    longEnough = len(region) >= 3 and len(name) >= 2
    return longEnough and re.sub(subPattern, '', region) == name
def geocode_region(item):
    '''
    Extract the provinces, cities and districts mentioned in a news item.

    Reads ``item['content']`` and writes ``item['province_list']``,
    ``item['city_list']`` and ``item['district_list']`` (lists of region
    names without duplicates, in no particular order).  The names of the
    parent cities of rejected district candidates are collected in
    ``item['debug']``.

    :param item: news item mapping; modified in place.
    :return: the same ``item``.
    '''
    # News text.
    text = item['content']
    for level in ('province', 'city', 'district'):
        item[level + '_list'] = []

    # File every region found in the text under the list for its level.
    for candidate in regionObjectList:
        if not region_object_in_content(text, candidate):
            continue
        item[candidate['level'] + '_list'].append(candidate['name'])

    # Each matched city also contributes its parent province.
    for matchedCity in item['city_list']:
        cityRecord = [c for c in cityObjectList if c['name'] == matchedCity][0]
        parentCode = cityRecord['parent']['adcode']
        provinceRecord = [
            p for p in provinceObjectList if p['adcode'] == parentCode][0]
        item['province_list'].append(provinceRecord['name'])

    # Keep a district only if one of its parent cities was matched as well.
    # TODO: add plausibility rules relating a district to its province.
    for districtName in item['district_list']:
        # Several districts can share one name, so all of them are checked.
        sameNameDistricts = [
            d for d in districtObjectList if d['name'] == districtName]
        for districtRecord in sameNameDistricts:
            parentCode = districtRecord['parent']['adcode']
            parentCity = [
                c for c in cityObjectList if c['adcode'] == parentCode][0]
            if parentCity['name'] in item['city_list']:
                # A matching parent city makes the district plausible.
                break
            # Otherwise remember which city this candidate belongs to.
            if item.get('debug', None) is None:
                item['debug'] = [parentCity['name']]
            else:
                item['debug'].append(parentCity['name'])
        else:
            # No candidate had its city in the list: drop the district.  The
            # list is rebuilt rather than mutated, so the iteration above
            # keeps walking the original list object.
            item['district_list'] = [
                d for d in item['district_list'] if d != districtName]

    for key in ('province_list', 'city_list', 'district_list'):
        item[key] = list(set(item[key]))
    return item
def geocode_ner(item, *, ak='xSCBGWXWcIQ5VRg1omPYWpcgtAySsMYE', timeout=10):
    '''
    Geocode the NER place names of ``item`` with the Baidu geocoding API.

    Steps:

    1. Merge ``item['location_ner']`` and ``item['institute_ner']``; if the
       result is empty, return ``item`` untouched.
    2. Drop names that merely repeat a region already listed in
       ``item['province_list']``, ``item['city_list']`` or
       ``item['district_list']``.
    3. Geocode every remaining name once per candidate city (the cities if
       any, otherwise the provinces) and keep the best accepted result.

    Keys written: ``provinces`` and ``cities`` (left empty here),
    ``locations_bd09`` (coordinates as returned by the API), ``locations``
    (the same points passed through ``bd09_to_wgs84``) and ``geocode_msg``
    (raw API responses, keyed by place name and then by city).

    Fixes relative to the previous version: the province fallback tested
    ``item['provinces']`` (always empty at that point) instead of
    ``item['province_list']``; the stored quality metrics were not refreshed
    when a better result replaced an earlier one; the function returned
    ``None`` on the success path; query values were interpolated into the
    URL without escaping; requests had no timeout.

    :param item: news item mapping; modified in place.
    :param ak: Baidu API key.
    :param timeout: per-request timeout in seconds, passed to ``requests``.
    :return: the same ``item``.
    '''
    # 1. Nothing to do when NER produced no names at all.
    location_ls = list(set(item['location_ner'] + item['institute_ner']))
    if not location_ls:
        return item
    # Geocoding results.
    item['provinces'] = {}
    item['cities'] = {}
    item['locations_bd09'] = {}
    item['locations'] = {}
    # Raw geocoding responses.
    item['geocode_msg'] = {}

    # 2. Remove names that are just one of the already-known regions.
    region_ls = (item['province_list'] + item['city_list']
                 + item['district_list'])
    location_ls = [
        name for name in location_ls
        if not any(region_name_like_name(name, region_name)
                   for region_name in region_ls)
    ]

    # 3. Candidate cities: the matched cities, else the matched provinces.
    # TODO: handle nation-wide / province-wide place names.
    geocode_city_list = item['city_list'] or item['province_list']
    if not geocode_city_list:
        return item

    for location_name in location_ls:
        geocode_msg = {}
        item['geocode_msg'][location_name] = geocode_msg
        # (comprehension, confidence) of the result currently stored for
        # this name; None until a first result has been accepted.
        best = None
        for geocode_city in geocode_city_list:
            r = requests.get(
                'http://api.map.baidu.com/geocoding/v3/',
                params={
                    'city': geocode_city,
                    'address': location_name,
                    'output': 'json',
                    'ak': ak,
                },
                timeout=timeout,
            )
            payload = r.json()
            geocode_msg[geocode_city] = payload
            # A missing, None or non-zero status all mean "no usable result".
            if payload.get('status') != 0:
                continue
            result = payload['result']
            comprehension = result['comprehension']
            confidence = result['confidence']
            if comprehension < 70 or confidence < 20:
                continue
            # A later city replaces the stored result only when it is at
            # least as good on both metrics.
            if best is not None and (comprehension < best[0]
                                     or confidence < best[1]):
                continue
            best = (comprehension, confidence)
            longitude = result['location']['lng']
            latitude = result['location']['lat']
            item['locations_bd09'][location_name] = {
                'longitude': longitude, 'latitude': latitude}
            trans = bd09_to_wgs84(longitude, latitude)
            item['locations'][location_name] = {
                'longitude': trans[0], 'latitude': trans[1]}
    return item
def scale(item, postgis=True):
    '''
    Compute the spatial scale and span of a news item.

    Scale (written to ``item['scale']``):
        0: no place name
        1: district level
        2: several districts / city level
        3: several cities / province level
        4: several provinces / national level

    Span (written to ``item['span']`` unless ``postgis`` is false): 0 when
    the item has at most one entry in ``item['locations']`` (or no such key);
    otherwise the radius returned by PostGIS ``ST_MinimumBoundingRadius``
    over the ``location`` rows whose ``news_id`` equals ``item['_id']``.

    Fixes relative to the previous version: the query is parameterised
    instead of being built with an f-string; the cursor and connection are
    closed even if the query fails; a missing ``locations`` key no longer
    raises; ``item`` is returned on every path.

    :param item: news item mapping; modified in place.
    :param postgis: when false, only the scale is computed.
    :return: the same ``item``.
    '''
    # Scale from the province / city / district lists.
    # NOTE(review): an item with exactly one city or province and no district
    # gets scale 0 here, which does not match the ladder described above -
    # confirm the intended rule before changing it.
    if len(item['province_list']) > 1:
        item['scale'] = 4
    elif len(item['city_list']) > 1:
        item['scale'] = 3
    elif len(item['district_list']) > 1:
        item['scale'] = 2
    elif len(item['district_list']) == 1:
        item['scale'] = 1
    else:
        item['scale'] = 0

    # Skip the span when PostGIS is not to be used.
    if not postgis:
        return item

    # Zero or one location: there is nothing to span.
    if len(item.get('locations') or {}) <= 1:
        item['span'] = 0
        return item

    query = (
        'SELECT (ST_MinimumBoundingRadius(ST_Collect(f.geom))).radius '
        'from (select geom from location where news_id = %s) as f;'
    )
    conn = psycopg2.connect(**POSTGRESQL_CON)
    try:
        cur = conn.cursor()
        try:
            # str() keeps the comparison textual, as the former quoted
            # literal did.
            cur.execute(query, (str(item['_id']),))
            item['span'] = cur.fetchone()[0]
        finally:
            cur.close()
    finally:
        conn.close()
    return item
def geocode_origin(item, city='武汉', *,
                   amap_key='6172ea799c64fdc98eed0bdd4869f3fc',
                   baidu_ak='xSCBGWXWcIQ5VRg1omPYWpcgtAySsMYE',
                   timeout=10):
    '''
    Original geocoding method.

    Provinces and cities are resolved from ``item['location_ner']`` with the
    Amap geocoding API; the remaining names (including
    ``item['institute_ner']``) are then geocoded with the Baidu API inside a
    single city.

    Keys written: ``provinces`` and ``cities`` (keyed by the formatted
    address returned by Amap), ``locations_bd09`` (coordinates as returned by
    Baidu), ``locations`` (the same points passed through
    ``bd09_to_wgs84``) and ``geocode_msg`` (raw API responses per name).

    Fixes relative to the previous version: a name occurring twice in
    ``item['location_ner']`` no longer raises ``ValueError`` on the second
    ``remove``; query values are escaped by ``requests`` instead of being
    interpolated into the URL; requests have a timeout; a response without
    ``status``/``count`` is skipped instead of raising ``KeyError``; ``item``
    is returned on every path.

    :param item: news item mapping; modified in place.
    :param city: city used for the Baidu lookups when exactly one city or
        province cannot be derived from the item.
    :param amap_key: Amap API key.
    :param baidu_ak: Baidu API key.
    :param timeout: per-request timeout in seconds, passed to ``requests``.
    :return: the same ``item``.
    '''
    # 1. Nothing to do when NER produced no names at all.
    location_ls = list(set(item['location_ner'] + item['institute_ner']))
    if not location_ls:
        return item
    # Geocoding results.
    item['provinces'] = {}
    item['cities'] = {}
    item['locations_bd09'] = {}
    item['locations'] = {}
    # Raw geocoding responses.
    item['geocode_msg'] = {}

    # 2. Resolve provinces and cities.  dict.fromkeys de-duplicates while
    # keeping order, so each name is queried - and removed - at most once.
    for location_name in dict.fromkeys(item['location_ner']):
        # "|" is the address separator of the API parameter.
        if '|' in location_name:
            continue
        r = requests.get(
            'https://restapi.amap.com/v3/geocode/geo',
            params={'key': amap_key, 'address': location_name},
            timeout=timeout,
        )
        payload = r.json()
        item['geocode_msg'][location_name] = payload
        # Only an unambiguous (single) hit is used.
        if payload.get('status') != '1' or payload.get('count') != '1':
            continue
        geocode = payload['geocodes'][0]
        level = geocode['level']
        longitude, latitude = geocode['location'].split(',')
        if level == '省':
            target = item['provinces']
        elif level == '市':
            target = item['cities']
        else:
            continue
        target[geocode['formatted_address']] = {
            'longitude': longitude,
            'latitude': latitude,
        }
        # Provinces and cities are not geocoded again as addresses.
        location_ls.remove(location_name)

    # 3. Address lookup inside one city: the single resolved city, else the
    # single resolved province, else the `city` argument.
    if len(item['cities']) == 1:
        city = next(iter(item['cities']))
    elif len(item['provinces']) == 1:
        city = next(iter(item['provinces']))

    for location_name in location_ls:
        r = requests.get(
            'http://api.map.baidu.com/geocoding/v3/',
            params={
                'city': city,
                'address': location_name,
                'output': 'json',
                'ak': baidu_ak,
            },
            timeout=timeout,
        )
        payload = r.json()
        item['geocode_msg'][location_name] = payload
        # A missing, None or non-zero status all mean "no usable result".
        if payload.get('status') != 0:
            continue
        result = payload['result']
        if result['comprehension'] >= 70 and result['confidence'] >= 20:
            longitude = result['location']['lng']
            latitude = result['location']['lat']
            item['locations_bd09'][location_name] = {
                'longitude': longitude, 'latitude': latitude}
            trans = bd09_to_wgs84(longitude, latitude)
            item['locations'][location_name] = {
                'longitude': trans[0], 'latitude': trans[1]}
    return item
def scale_origin(item, postgis=True):
    '''
    Compute the spatial scale and span for items geocoded by the original
    method.

    Provisional scale (written to ``item['scale']``):
        0: no place name
        1: below city level
        2: a single city
        3: several cities / province level
        4: several provinces

    Span (written to ``item['span']``): 0 when there is no address or only
    one; otherwise the radius returned by PostGIS
    ``ST_MinimumBoundingRadius`` over the ``location`` rows whose
    ``news_id`` equals ``item['_id']``.  Not written when ``postgis`` is
    false and the item has a ``locations`` key.

    Fixes relative to the previous version: the database credentials are no
    longer hard-coded (``POSTGRESQL_CON`` is used, as in ``scale``); the
    query is parameterised instead of being built with an f-string; the
    cursor and connection are closed even if the query fails; ``item`` is
    returned on every path.

    :param item: news item mapping; modified in place.
    :param postgis: when false, only the scale is computed.
    :return: the same ``item``.
    '''
    # No `locations` key means no place name was recognised at all.
    if 'locations' not in item:
        item['scale'] = 0
        item['span'] = 0
        return item

    # Scale from the resolved provinces and cities.
    if len(item['provinces']) > 1:
        item['scale'] = 4
    elif len(item['provinces']) == 1:
        item['scale'] = 3
    elif len(item['cities']) > 1:
        item['scale'] = 3
    elif len(item['cities']) == 1:
        item['scale'] = 2
    else:
        item['scale'] = 1

    # Skip the span when PostGIS is not to be used.
    if not postgis:
        return item

    # Zero or one location: there is nothing to span.
    if len(item['locations']) <= 1:
        item['span'] = 0
        return item

    query = (
        'SELECT (ST_MinimumBoundingRadius(ST_Collect(f.geom))).radius '
        'from (select geom from location where news_id = %s) as f;'
    )
    conn = psycopg2.connect(**POSTGRESQL_CON)
    try:
        cur = conn.cursor()
        try:
            # str() keeps the comparison textual, as the former quoted
            # literal did.
            cur.execute(query, (str(item['_id']),))
            item['span'] = cur.fetchone()[0]
        finally:
            cur.close()
    finally:
        conn.close()
    return item