-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsample_verification.py
More file actions
50 lines (37 loc) · 1.28 KB
/
sample_verification.py
File metadata and controls
50 lines (37 loc) · 1.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
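"""
Draw a count-weighted random sample of toponym points from a geopackage
and save the sample to an Excel file.

Created on Mon Oct 21 11:25:01 2024
"""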
import pandas as pd
import geopandas as gpd
import argparse
# number of points drawn when --sample-size is not given
DEFAULT_SAMPLE_SIZE = 1000

# columns kept for the sample; every other column is dropped
KEEP_COLUMNS = ['origin', 'gc_address', 'NUTS_ID_2013', 'NUTS_ID_2016',
                'NUTS_ID_2021', 'count']

# NUTS code columns; a row is kept if at least one of them is filled in
NUTS_COLUMNS = ['NUTS_ID_2013', 'NUTS_ID_2016', 'NUTS_ID_2021']


def parse_arguments(argv=None):
    """Parse the command-line arguments and return them as a dict.

    Args:
        argv: Optional list of argument strings. ``None`` (the default)
            makes argparse read ``sys.argv``.

    Returns:
        Dict with the keys ``input``, ``output``, ``sample_size`` and
        ``seed``.
    """
    # initialize argument parser
    ap = argparse.ArgumentParser()

    # set up arguments
    ap.add_argument("-i", "--input", required=True,
                    help="Path to the geopackage containing toponym "
                    "points with LAU and NUTS 3 codes.")
    ap.add_argument("-o", "--output", required=True,
                    help="Path to output Excel.")
    ap.add_argument("-n", "--sample-size", type=int,
                    default=DEFAULT_SAMPLE_SIZE,
                    help="Number of points to sample (default: %(default)s).")
    ap.add_argument("--seed", type=int, default=None,
                    help="Optional random seed for a reproducible sample.")

    # parse arguments
    return vars(ap.parse_args(argv))


def prepare_points(df):
    """Reduce the input table to the rows that are eligible for sampling.

    Keeps only ``KEEP_COLUMNS``, converts ``count`` to a numeric column
    (unparseable values become NaN), drops rows whose count is missing or
    below 1, and drops rows in which all NUTS code columns are empty.

    Args:
        df: Table containing at least the columns in ``KEEP_COLUMNS``.

    Returns:
        A new, filtered table; the input is not modified.
    """
    # reduce column clutter; copy so the assignment below works on an
    # independent frame instead of a slice of the input
    df = df[KEEP_COLUMNS].copy()

    # convert count to numeric
    df['count'] = pd.to_numeric(df['count'], downcast='integer',
                                errors='coerce')

    # drop rows without any counts (NaN compares False, so it is dropped too)
    df = df[df['count'] >= 1]

    # drop rows that are outside NUTS regions (e.g. in Russia, USA)
    return df.dropna(subset=NUTS_COLUMNS, how='all')


def weighted_sample(df, n=DEFAULT_SAMPLE_SIZE, random_state=None):
    """Draw up to ``n`` rows without replacement, weighted by ``count``.

    Args:
        df: Table with a numeric ``count`` column used as sampling weight.
        n: Requested number of rows.
        random_state: Optional seed passed on to ``DataFrame.sample``.

    Returns:
        The sampled rows. When fewer than ``n`` rows are available, all
        available rows are returned instead of raising an error.

    Raises:
        ValueError: If ``n`` is negative.
    """
    if n < 0:
        raise ValueError("sample size must be non-negative, got {}".format(n))

    # sampling is without replacement, so never ask for more rows than exist
    size = min(n, len(df))
    if size < n:
        print("[WARN] - Only {} points available, requested {}; "
              "sampling all of them.".format(len(df), n))

    # nothing to draw: weights over an empty table cannot be normalised
    if size == 0:
        return df.head(0)

    return df.sample(n=size, weights=df['count'], random_state=random_state)


def main(argv=None):
    """Read the geopackage, sample the points and write them to Excel."""
    args = parse_arguments(argv)

    # read data to be sampled
    print("[INFO] - Reading in geopackage....")
    df = prepare_points(gpd.read_file(args['input']))

    # get the weighted sample
    sample = weighted_sample(df, n=args['sample_size'],
                             random_state=args['seed'])

    # save to excel
    print("[INFO] - Saving the weighted sample of {} points...".format(
        len(sample)))
    sample.to_excel(args['output'])
    print("[INFO] - ... done!")


if __name__ == "__main__":
    main()