GreenValueNet/pre_processing_funcs.py at main · fgallagher27/GreenValueNet · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
"""
This script contains the functions needed to combine downloads that are multiple files into one file.
It can be imported as a module into the data download and clean scripts
It contains the following functions:

    * concat_postcodes
    * concat_roads
    * make_coastline
    * match_ons_postcode

"""

import os
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from data_load_funcs import load_data_catalogue, get_file_path

# Base directory for every data path built in this module. It is captured once,
# at import time, so the functions below look for the "data" folder inside the
# directory the process was started from.
cwd = Path.cwd()

def pre_processing(processed_files_lib: dict):
    """
    Build every pre-processed input that is not on disk yet.

    processed_files_lib maps an output file name to the zero-argument
    function that creates it. Each file name is looked up inside
    data/processed_inputs (relative to the working directory); the
    function is called only when that file is missing, otherwise a
    message is printed and the entry is skipped.
    """

    print("Beginning pre-processing of data...")
    for file_name, builder in processed_files_lib.items():
        target = cwd / "data" / "processed_inputs" / file_name
        if os.path.exists(target):
            # output already present, nothing to rebuild
            print(f"{file_name} has already been pre-processed")
            continue
        builder()
    print("Pre-processing completed.")


def concat_postcodes():
    """
    Combine the raw postcode csv files into a single shapefile.

    Every csv in data/raw_inputs/postcodes (except the column-header file)
    is read without a header row. Only the columns at positions 0, 2, 3
    and 4 are kept: postcode, eastings, northings and the code used to
    filter for English postcodes ('E92000001'). The filtered rows are
    concatenated, turned into point geometries in EPSG:27700, reprojected
    to EPSG:3857 and written to data/processed_inputs/postcodes_c.shp.

    Raises:
        ValueError: raised by pd.concat when the folder holds no postcode
            csv files to combine.
    """
    print("Preparing postcode data...")
    postcode_folder = cwd / "data" / "raw_inputs" / "postcodes"

    # This csv only lists the column headers, so it holds no postcode rows.
    header_file = "Code-Point_Open_Column_Headers.csv"
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the output reproducible between runs and machines.
    csv_files = sorted(
        f for f in os.listdir(postcode_folder)
        if f.endswith('.csv') and f != header_file
    )
    cols = ['postcode', 'eastings', 'northings']

    # TODO refactor using ThreadPoolExecutor to speed up
    dfs = []
    for file in csv_files:
        # Parse only the four columns that are used instead of the full file.
        df = pd.read_csv(
            postcode_folder / file,
            header=None,
            usecols=[0, 2, 3, 4]
        )
        # The last retained column is the code filtered on for England;
        # it is dropped once the filter has been applied.
        eng = df[df.iloc[:, -1] == 'E92000001']
        dfs.append(eng.iloc[:, :-1])

    postcodes_full = pd.concat(dfs, ignore_index=True, axis=0)
    postcodes_full.columns = cols

    # convert to shapefile; points_from_xy builds all points in one
    # vectorised call rather than one Python-level Point per row
    geometry = gpd.points_from_xy(
        postcodes_full['eastings'],
        postcodes_full['northings']
    )
    gdf = gpd.GeoDataFrame(postcodes_full, geometry=geometry, crs="EPSG:27700")
    gdf = gdf.to_crs("EPSG:3857").drop(columns=['eastings', 'northings'])
    gdf.to_file(
        cwd / "data" / "processed_inputs" / "postcodes_c.shp",
        driver="ESRI Shapefile"
    )


def concat_roads():
    """
    Bind the raw road shapefiles into a single shapefile of main roads.

    Every file in data/raw_inputs/roads whose name ends in 'RoadLink.shp'
    is read, filtered to rows whose 'class' is 'A Road' or 'Motorway', and
    trimmed to the identifier, class and geometry columns. The files are
    read on up to four worker threads, then concatenated and written to
    data/processed_inputs/roads_c.shp.

    Raises:
        ValueError: raised by pd.concat when no matching shapefile is found.
    """
    print("Preparing road data...")
    road_folder = cwd / "data" / "raw_inputs" / "roads"

    # os.listdir returns entries in arbitrary order; sorting (together with
    # executor.map preserving input order) keeps the output row order
    # reproducible between runs and machines.
    road_files = sorted(
        f for f in os.listdir(road_folder) if f.endswith('RoadLink.shp')
    )
    main_roads = ['A Road', 'Motorway']
    cols = ['identifier', 'class', 'geometry']

    def clean_road_subset(file_name):
        """
        Cleans an individual road shapefile
        """
        road_shp = gpd.read_file(os.path.join(road_folder, file_name))
        road_shp = road_shp[road_shp['class'].isin(main_roads)]
        return road_shp[cols]

    with ThreadPoolExecutor(max_workers=4) as executor:
        shps = list(executor.map(clean_road_subset, road_files))

    road_shp_full = pd.concat(shps, ignore_index=True, axis=0)
    road_shp_full.to_file(
        cwd / "data" / "processed_inputs" / "roads_c.shp",
        driver="ESRI Shapefile"
    )


def make_coastline():
    """
    Create a boundary outline from the English regional polygons.

    The regional shapefile is dissolved into a single geometry, reprojected
    to EPSG:3857, and the boundary of that geometry is written to
    data/processed_inputs/coastline.shp.
    """
    # NOTE(review): the boundary of the dissolved regions presumably also
    # follows England's land borders, not only the coast - confirm this is
    # acceptable for downstream distance-to-coast calculations.
    print("Preparing coastline data...")
    source_path = cwd / "data" / "raw_inputs" / "english_region_region.shp"
    target_path = cwd / "data" / "processed_inputs" / "coastline.shp"

    regions = gpd.read_file(source_path)
    merged = regions.dissolve()
    merged = merged.to_crs("EPSG:3857")
    outline = merged['geometry'].boundary
    outline.to_file(target_path, driver="ESRI Shapefile")


def match_ons_postcode():
    """
    Attach the ONS ward code used in GLUD to each postcode.

    The postcode shapefile and the mapping csv are located through the data
    catalogue. Postcodes are left-joined to the mapping's 'pcds' column,
    rows containing any missing value are dropped (which removes postcodes
    without a ward match), and the result is written to
    data/processed_inputs/mapped_postcodes.shp.
    """
    print("Mapping postcodes to census wards...")
    catalogue = load_data_catalogue()
    data_dir = cwd / "data"
    postcode_path = data_dir / get_file_path(catalogue, 'inputs', 'postcodes')
    mapping_path = data_dir / get_file_path(catalogue, 'inputs', 'glud_mapping')

    postcodes = gpd.read_file(postcode_path)
    # only the join key and the ward code are needed from the mapping file
    ward_lookup = pd.read_csv(mapping_path)[['pcds', 'statsward']]

    # NOTE(review): the join assumes 'postcode' and 'pcds' format postcodes
    # identically (spacing/case) - confirm against both sources.
    joined = postcodes.merge(
        ward_lookup,
        left_on = "postcode",
        right_on = "pcds",
        how="left"
    )
    matched = joined.dropna()
    # 'pcds' duplicates 'postcode' after the join
    matched = matched.drop(columns='pcds')
    matched.to_file(
        data_dir / "processed_inputs" / "mapped_postcodes.shp",
        driver="ESRI Shapefile"
    )


if __name__ == "__main__":
    # Maps each expected output file in data/processed_inputs to the function
    # that builds it. Entries are processed in insertion order.
    # NOTE(review): match_ons_postcode reads a postcode shapefile located via
    # the data catalogue - presumably postcodes_c.shp, so it must stay after
    # concat_postcodes; confirm against the catalogue.
    # TODO move this to a yml file to hold parameters
    processed_files_lib = {
        "postcodes_c.shp": concat_postcodes,
        "roads_c.shp": concat_roads,
        "coastline.shp": make_coastline,
        "mapped_postcodes.shp": match_ons_postcode
    }
    # Without this call the script only built the dictionary and exited
    # without pre-processing anything.
    pre_processing(processed_files_lib)