Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions space2stats_api/src/space2stats_ingest/METADATA/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,14 @@ Follow these steps to create the initial STAC metadata:

## Adding a New STAC Item

To add a new STAC Item, update the Excel spreadsheet with the relevant fields, and pass your new Parquet dataset to the `link_new_item.py` script.

1. **Update Metadata File**:
- In the **Feature Catalog** sheet of `Space2Stats Metadata Content.xlsx`, add a description for each new variable in your dataset.
- Create an item id for the new set of variables, for example *world_pop_2025* or *nighttime_lights_2013*.
- Add a new entry in the **Sources** sheet if it doesn’t exist already.
> [!IMPORTANT]
> Make sure that the Item column in **Sources** corresponds to the same item id you created in the **Feature Catalog** sheet. This will be used to retrieve relevant information.
To add a new STAC Item, update the CSV metadata files in the `metadata_content/` folder, and pass your new Parquet dataset to the `link_new_item.py` script.

1. **Update Metadata CSVs**:
- In `metadata_content/Space2Stats_Metadata_Feature_Catalog.csv`, add a row for each new variable in your dataset with columns: `variable`, `description`, `nodata`, and `item`.
- Create an item id for the new set of variables, for example *world_pop* or *nighttime_lights*.
- In `metadata_content/Space2Stats_Metadata_Sources.csv`, add a new row for the item with its name, description, citation, method, resolution, and optional start/end dates.
> [!IMPORTANT]
> Make sure that the `Item` column in **Sources** corresponds to the same item id you used in the **Feature Catalog**. This is used to link variables to their source metadata.

2. **Run *link_new_item.py* script**:
- Navigate to the `METADATA` sub-directory and execute the following command:
Expand Down
Binary file not shown.
98 changes: 47 additions & 51 deletions space2stats_api/src/space2stats_ingest/METADATA/link_new_item.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import argparse
import json
import csv
import os
from datetime import datetime
from os.path import join
from typing import Dict
from typing import Dict, List

import git
import pandas as pd
import pyarrow as pa
from dateutil.parser import parse as dtparse # type: ignore[import-untyped]
from pyarrow.parquet import ParquetFile
from pystac import Asset, CatalogType, Collection, Item
from pystac.extensions.table import TableExtension
Expand Down Expand Up @@ -36,37 +36,23 @@ def get_types(parquet_file: str):
return column_types


# Function to save an updated dictionary of column types. Will not be used for now.
def save_parquet_types_to_json(parquet_file: str):
git_root = get_git_root()
json_file = join(
git_root, "space2stats_api/src/space2stats_ingest/METADATA/types.json"
)
df = pd.read_parquet(parquet_file, nrow=10)

# Get the column names and their types
column_types = {col: str(df[col].dtype) for col in df.columns}
def _read_csv(path: str) -> List[dict]:
    """Read a CSV file and return a list of row dicts.

    Each row is keyed by the header names on the file's first line.

    The file is decoded explicitly as UTF-8 instead of relying on the
    platform's locale encoding. The ``utf-8-sig`` codec also drops a leading
    byte-order mark if one is present; otherwise the mark would become part
    of the first header name and break lookups by that column.
    """
    # newline="" lets the csv module do its own line-ending handling,
    # including newlines embedded inside quoted fields.
    with open(path, newline="", encoding="utf-8-sig") as f:
        return list(csv.DictReader(f))

# Save the column types to a JSON file
with open(json_file, "r+") as f:
data_types = json.load(f) # Read the existing data
data_types.update(column_types) # Update with new columns
f.seek(0) # Move to the start of the file
json.dump(data_types, f, indent=4) # Write updated data
f.truncate()

print(f"Column types saved to {json_file}")
# Function to load metadata from CSV files
def load_metadata(metadata_dir: str) -> Dict[str, object]:
feature_rows = _read_csv(
join(metadata_dir, "Space2Stats_Metadata_Feature_Catalog.csv")
)
sources = _read_csv(join(metadata_dir, "Space2Stats_Metadata_Sources.csv"))

# Build feature_catalog as a dict keyed by variable name
feature_catalog = {row["variable"]: row for row in feature_rows}

# Function to load metadata from the Excel file
def load_metadata(file: str) -> Dict[str, pd.DataFrame]:
overview = pd.read_excel(file, sheet_name="DDH Dataset", index_col="Field")
nada = pd.read_excel(file, sheet_name="NADA", index_col="Field")
feature_catalog = pd.read_excel(file, sheet_name="Feature Catalog")
sources = pd.read_excel(file, sheet_name="Sources")
return {
"overview": overview,
"nada": nada,
"feature_catalog": feature_catalog,
"sources": sources,
}
Expand All @@ -79,10 +65,10 @@ def load_existing_collection(collection_path: str) -> Collection:

# Function to create a new STAC item
def create_new_item(
sources: pd.DataFrame,
sources: List[dict],
column_types: dict,
item_id: str,
feature_catalog: pd.DataFrame,
feature_catalog: Dict[str, dict],
) -> tuple[Item, str]:
# Define geometry and bounding box (you may want to customize these)
geom = {
Expand All @@ -105,12 +91,15 @@ def create_new_item(
]

# Get metadata for the new item
try:
src_metadata = sources[sources["Item"] == item_id].iloc[0]
except IndexError:
src_metadata = None
for row in sources:
if row["Item"] == item_id:
src_metadata = row
break
if src_metadata is None:
raise IndexError(f"Item '{item_id}' not found in the metadata sources sheet")

if pd.isna(src_metadata["End Date"]):
if not src_metadata["Start Date"] or not src_metadata["End Date"]:
# Define the item
item = Item(
id=item_id,
Expand All @@ -134,13 +123,16 @@ def create_new_item(
)
else:
# Define the item with a time range
def _parse_date(val):
return dtparse(str(val).strip())

item = Item(
id=item_id,
geometry=geom,
bbox=bbox,
datetime=None,
start_datetime=src_metadata["Start Date"],
end_datetime=src_metadata["End Date"],
start_datetime=_parse_date(src_metadata["Start Date"]),
end_datetime=_parse_date(src_metadata["End Date"]),
properties={
"name": src_metadata["Name"],
"description": src_metadata["Description"],
Expand All @@ -163,7 +155,7 @@ def create_new_item(
table_extension.columns = [
{
"name": col,
"description": feature_catalog.loc[col, "description"],
"description": feature_catalog[col]["description"],
"type": dtype,
}
for col, dtype in column_types.items()
Expand Down Expand Up @@ -205,24 +197,28 @@ def main():

# Paths and metadata setup
collection_path = join(metadata_dir, "stac/space2stats-collection/collection.json")
excel_path = join(metadata_dir, "Space2Stats Metadata Content.xlsx")
column_types = get_types(input_parquet)

# Load metadata and column types
metadata = load_metadata(excel_path)
# Load metadata from CSVs
metadata = load_metadata(join(metadata_dir, "metadata_content"))
feature_catalog = metadata["feature_catalog"]

# Find item name and metadata based on column names
feature_catalog.set_index("variable", inplace=True)
try:
feature_catalog = feature_catalog.loc[column_types.keys()]
except KeyError as e:
raise KeyError(f"Column '{e}' not found in the metadata feature catalog sheet")
item_ids = feature_catalog["item"].unique()
item_id = [id for id in item_ids if id != "all"]
if len(item_id) != 1:
raise ValueError(f"Expected one item name, found {len(item_id)}")
item_id = item_id[0]
for col in column_types:
if col not in feature_catalog:
raise KeyError(
f"Column '{col}' not found in the metadata feature catalog sheet"
)
item_ids = {feature_catalog[col]["item"] for col in column_types}
item_ids.discard("all")
if len(item_ids) != 1:
raise ValueError(f"Expected one item name, found {len(item_ids)}")
item_id = item_ids.pop()

# Filter feature_catalog to only columns in the parquet
feature_catalog = {
col: feature_catalog[col] for col in column_types if col in feature_catalog
}

# Load existing collection
collection = load_existing_collection(collection_path)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
Field,Value
Title,Space2Stats Database
Description,A global dataset of geospatial variables at the grid level (hexagon H3 level 6).
TTL,Ben Stewart
Business Unit,DECSC
Collaborator,Andres Chamorro
Classification,Public
License,Creative Commons Attribution 4.0
,
Data Resource,
Classification,Public
Resource URL,https://space2stats.ds.io/docs
Resource Title,Space2Stats API
Description Resource,"This database contains geospatial statistics for the entire globe standardized to a hexagonal grid. The spatial unit of the dataset is the H3 level 6 (approximately 36 sq. km. per cell). The variables cover a wide range of geographic themes relevant to international development, including demographic, socio-economic, environmental, climate, and infrastructure. An API enables users to query, access, and aggregate statistics from the Space2Stats database. The purpose of this API is to facilitate the generation of sub-national geospatial aggregates for any administrative boundary set."
Release Note,test
Release Date,2024-08-22
First Published Date,2024-08-22
Maintenance and Update Frequency,No fixed schedule
Maintenance Note,
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
variable,description,nodata,item
hex_id,H3 unique identifier,,all
ogc_fid,Feature unique identifier,,all
sum_built_area_m_1975,Total built area (m2) in 1975,,builtarea_ghsl
sum_built_area_m_1980,Total built area (m2) in 1980,,builtarea_ghsl
sum_built_area_m_1985,Total built area (m2) in 1985,,builtarea_ghsl
sum_built_area_m_1990,Total built area (m2) in 1990,,builtarea_ghsl
sum_built_area_m_1995,Total built area (m2) in 1995,,builtarea_ghsl
sum_built_area_m_2000,Total built area (m2) in 2000,,builtarea_ghsl
sum_built_area_m_2005,Total built area (m2) in 2005,,builtarea_ghsl
sum_built_area_m_2010,Total built area (m2) in 2010,,builtarea_ghsl
sum_built_area_m_2015,Total built area (m2) in 2015,,builtarea_ghsl
sum_built_area_m_2020,Total built area (m2) in 2020,,builtarea_ghsl
sum_built_area_m_2025,Total built area (m2) in 2025,,builtarea_ghsl
sum_built_area_m_2030,Total built area (m2) in 2030,,builtarea_ghsl
spi,"Standardized Precipitation Index (SPI), 6-month timescale",,climate
date,"Month, formatted as YYYY-MM-DD",,climate
cy_frequency_mean,Tropical Cyclone Frequency,,cyclones
drought_spei_1_5_rp100_mean,"Drought hazard (SPEI ≤ -1.5, 100-year return period)",,drought
fires_density_mean,Fire Density,,fires
pop,"Sum of Gridded Population, 2020",,flood_exposure_15cm_1in100
pop_flood,"Sum of population exposed to floods greater than 15 cm, 1 in 100 return period",,flood_exposure_15cm_1in100
pop_flood_pct,"Percent of population exposed to floods greater than 15 cm, 1 in 100 return period",,flood_exposure_15cm_1in100
landslide_susceptibility_mean_2023,Landslide Susceptibility Index,,landslide_susceptibility
sum_viirs_ntl_2012,Sum of VIIRS nighttime lights brightness for 2012,,nighttime_lights
sum_viirs_ntl_2013,Sum of VIIRS nighttime lights brightness for 2013,,nighttime_lights
sum_viirs_ntl_2014,Sum of VIIRS nighttime lights brightness for 2014,,nighttime_lights
sum_viirs_ntl_2015,Sum of VIIRS nighttime lights brightness for 2015,,nighttime_lights
sum_viirs_ntl_2016,Sum of VIIRS nighttime lights brightness for 2016,,nighttime_lights
sum_viirs_ntl_2017,Sum of VIIRS nighttime lights brightness for 2017,,nighttime_lights
sum_viirs_ntl_2018,Sum of VIIRS nighttime lights brightness for 2018,,nighttime_lights
sum_viirs_ntl_2019,Sum of VIIRS nighttime lights brightness for 2019,,nighttime_lights
sum_viirs_ntl_2020,Sum of VIIRS nighttime lights brightness for 2020,,nighttime_lights
sum_viirs_ntl_2021,Sum of VIIRS nighttime lights brightness for 2021,,nighttime_lights
sum_viirs_ntl_2022,Sum of VIIRS nighttime lights brightness for 2022,,nighttime_lights
sum_viirs_ntl_2023,Sum of VIIRS nighttime lights brightness for 2023,,nighttime_lights
sum_viirs_ntl_2024,Sum of VIIRS nighttime lights brightness for 2024,,nighttime_lights
ghs_11_count,Total number of cells in very low density areas,,urbanization_ghssmod
ghs_12_count,Total number of cells in low density rural areas,,urbanization_ghssmod
ghs_13_count,Total number of cells in rural areas,,urbanization_ghssmod
ghs_21_count,Total number of cells in suburban grid cells,,urbanization_ghssmod
ghs_22_count,Total number of cells in semi-dense urban clusters,,urbanization_ghssmod
ghs_23_count,Total number of cells in dense urban clusters,,urbanization_ghssmod
ghs_30_count,Total number of cells in urban centres,,urbanization_ghssmod
ghs_total_count,Total number of cells in all categories in GHS database,,urbanization_ghssmod
ghs_11_pop,Total population in very low density areas,,urbanization_ghssmod
ghs_12_pop,Total population in low density rural areas,,urbanization_ghssmod
ghs_13_pop,Total population in rural areas,,urbanization_ghssmod
ghs_21_pop,Total population in suburban grid cells,,urbanization_ghssmod
ghs_22_pop,Total population in semi-dense urban clusters,,urbanization_ghssmod
ghs_23_pop,Total population in dense urban clusters,,urbanization_ghssmod
ghs_30_pop,Total population in urban centres,,urbanization_ghssmod
ghs_total_pop,Total population based on GHS-Pop population,,urbanization_ghssmod
sum_f_00_2025,"Total population female, ages 0 to 1, 2025",,world_pop
sum_f_01_2025,"Total population female, ages 1 to 5, 2025",,world_pop
sum_f_05_2025,"Total population female, ages 5 to 10, 2025",,world_pop
sum_f_10_2025,"Total population female, ages 10 to 15, 2025",,world_pop
sum_f_15_2025,"Total population female, ages 15 to 20, 2025",,world_pop
sum_f_20_2025,"Total population female, ages 20 to 25, 2025",,world_pop
sum_f_25_2025,"Total population female, ages 25 to 30, 2025",,world_pop
sum_f_30_2025,"Total population female, ages 30 to 35, 2025",,world_pop
sum_f_35_2025,"Total population female, ages 35 to 40, 2025",,world_pop
sum_f_40_2025,"Total population female, ages 40 to 45, 2025",,world_pop
sum_f_45_2025,"Total population female, ages 45 to 50, 2025",,world_pop
sum_f_50_2025,"Total population female, ages 50 to 55, 2025",,world_pop
sum_f_55_2025,"Total population female, ages 55 to 60, 2025",,world_pop
sum_f_60_2025,"Total population female, ages 60 to 65, 2025",,world_pop
sum_f_65_2025,"Total population female, ages 65 to 70, 2025",,world_pop
sum_f_70_2025,"Total population female, ages 70 to 75, 2025",,world_pop
sum_f_75_2025,"Total population female, ages 75 to 80, 2025",,world_pop
sum_f_80_2025,"Total population female, ages 80 to 85, 2025",,world_pop
sum_f_85_2025,"Total population female, ages 85 to 90, 2025",,world_pop
sum_f_90_2025,"Total population female, ages 90 and above, 2025",,world_pop
sum_m_00_2025,"Total population male, ages 0 to 1, 2025",,world_pop
sum_m_01_2025,"Total population male, ages 1 to 5, 2025",,world_pop
sum_m_05_2025,"Total population male, ages 5 to 10, 2025",,world_pop
sum_m_10_2025,"Total population male, ages 10 to 15, 2025",,world_pop
sum_m_15_2025,"Total population male, ages 15 to 20, 2025",,world_pop
sum_m_20_2025,"Total population male, ages 20 to 25, 2025",,world_pop
sum_m_25_2025,"Total population male, ages 25 to 30, 2025",,world_pop
sum_m_30_2025,"Total population male, ages 30 to 35, 2025",,world_pop
sum_m_35_2025,"Total population male, ages 35 to 40, 2025",,world_pop
sum_m_40_2025,"Total population male, ages 40 to 45, 2025",,world_pop
sum_m_45_2025,"Total population male, ages 45 to 50, 2025",,world_pop
sum_m_50_2025,"Total population male, ages 50 to 55, 2025",,world_pop
sum_m_55_2025,"Total population male, ages 55 to 60, 2025",,world_pop
sum_m_60_2025,"Total population male, ages 60 to 65, 2025",,world_pop
sum_m_65_2025,"Total population male, ages 65 to 70, 2025",,world_pop
sum_m_70_2025,"Total population male, ages 70 to 75, 2025",,world_pop
sum_m_75_2025,"Total population male, ages 75 to 80, 2025",,world_pop
sum_m_80_2025,"Total population male, ages 80 to 85, 2025",,world_pop
sum_m_85_2025,"Total population male, ages 85 to 90, 2025",,world_pop
sum_m_90_2025,"Total population male, ages 90 and above, 2025",,world_pop
sum_pop_2015,"Total population, 2015",,world_pop
sum_pop_2016,"Total population, 2016",,world_pop
sum_pop_2017,"Total population, 2017",,world_pop
sum_pop_2018,"Total population, 2018",,world_pop
sum_pop_2019,"Total population, 2019",,world_pop
sum_pop_2020,"Total population, 2020",,world_pop
sum_pop_2021,"Total population, 2021",,world_pop
sum_pop_2022,"Total population, 2022",,world_pop
sum_pop_2023,"Total population, 2023",,world_pop
sum_pop_2024,"Total population, 2024",,world_pop
sum_pop_2025,"Total population, 2025",,world_pop
sum_pop_2026,"Total population, 2026",,world_pop
sum_pop_2027,"Total population, 2027",,world_pop
sum_pop_2028,"Total population, 2028",,world_pop
sum_pop_2029,"Total population, 2029",,world_pop
sum_pop_2030,"Total population, 2030",,world_pop
sum_f_2025,"Total female population, all ages, 2025",,world_pop
sum_m_2025,"Total male population, all ages, 2025",,world_pop
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
Group,Field,Value
Identification,Title,Space2Stats Database
Identification,Identifier,GLO_2024_SPACE2STATS_GEO_v01
Identification,Hierarchy level,dataset
Identification,Edition,v.1
Identification,Edition Date,2024-09-06
Identification,Status,
Identification,Language,ENG
Identification,Characterset,utf-8
Identification,Date,
Identification,2024-09-06,creation
Identification,Graphic overview,
Identification,Responsible party,"Ben Stewart (Task Leader), Andres Chamorro (Collaborator), Development Data Group (DECDG), World Bank"
Identification,Presentation form,API
Identification,Series name,Space2Stats Hexagonal Grid Database
Identification,Citation,
Identification,Abstract,"This database contains geospatial statistics for the entire globe standardized to a hexagonal grid. The spatial unit of the dataset is the H3 level 6 (approximately 36 sq. km. per cell). The variables cover a wide range of geographic themes relevant to international development, including demographic, socio-economic, environmental, climate, and infrastructure. An API enables users to query, access, and aggregate statistics from the Space2Stats database."
Identification,Purpose,The purpose of this API is to facilitate the generation of sub-national geospatial aggregates for any administrative boundary set.
Identification,Point of contact,"Andres Chamorro (Collaborator)
World Bank, Development Data Group (DECDG)
achamorroelizond@worldbank.org"
Identification,Resource maintenance,
Identification,Update frequency,As needed
Identification,Descriptive keywords,?
Identification,Spatial representation type,vector
Spatial extent,Place,Global
Spatial extent,East,180.00
Spatial extent,West,-180.00
Spatial extent,North,89.99
Spatial extent,South,-89.99
Spatial extent,Reference system,
Spatial extent,Code,4326
Spatial extent,Code space,EPSG
Constraints,Access constraints,unrestricted
Constraints,Use constraints,unrestricted
Constraints,Use limitations,The information contained in this dataset is for general information purpose only.
Distribution,Distribution format,
Distribution,Name,json
Distribution,Specification,API response. Docs: https://space2stats.ds.io/docs
Distribution,Distributor,Development Seed
Data quality,Lineage statement,TBD: description of dataset creation
Data quality,Lineage process step,TBD: methodology
Data quality,Processor,GOST and DevSeed
Metadata,Metadata standard,ISO 19115-1:2014
Metadata,Date stamp,2024-09-06
Metadata,Language,ENG
Metadata,Contacts,"Andres Chamorro (collaborator)
World Bank, Development Data Group (DECDG)"
Loading
Loading