import os
import xarray
import re
import pandas as pd
import tempfile
import boto3
# SEDAC Gridded World Population Data
# Documentation of data transformation
# This script was used to transform SEDAC Gridded World Population Data from netCDF to Cloud Optimized GeoTIFF (COG) format for display in the Greenhouse Gas (GHG) Center.
session = boto3.session.Session()
s3_client = session.client("s3")
# S3 bucket where the COGs are stored after transformation
bucket_name = "ghgc-data-store-dev"
# Each entry of the local "gpw" directory is treated as a folder of rasters.
fold_names = os.listdir("gpw")


def _build_cog_filename(name):
    """Return the COG file name derived from a source raster file name.

    The name is split on "_", " " and "."; the last piece (the file
    extension) is dropped, the piece that is then third from the end is
    appended again as a suffix, the pieces are re-joined with "_" and
    ".tif" is added.

    Example: "gpw_v4_population_density_rev11_2000_30_sec.tif" becomes
    "gpw_v4_population_density_rev11_2000_30_sec_2000.tif".

    Args:
        name: File name of the source raster.

    Returns:
        The file name to use for the generated COG.

    Raises:
        IndexError: If fewer than three pieces remain after the extension
            has been dropped.
    """
    filename_elements = re.split(r"[_ .]", os.path.basename(name))
    filename_elements.pop()  # drop the extension
    # Insert the date of the generated COG into the file name.
    # NOTE(review): presumably the third-from-last piece is the year of the
    # dataset -- confirm against the actual source file names.
    filename_elements.append(filename_elements[-3])
    return "_".join(filename_elements) + ".tif"


# Rows of the tracking table: one record per file transformed into a COG.
# They are collected in a plain list and turned into a DataFrame once, after
# the loop, instead of growing a DataFrame row by row.
processed_records = []

# Reading the raw files from the local machine.
# NOTE(review): the script description mentions netCDF input, but only names
# ending in ".tif" are processed here -- confirm the intended input format.
for fol_ in fold_names:
    for name in os.listdir(f"gpw/{fol_}"):
        if not name.endswith(".tif"):
            continue

        xds = xarray.open_dataarray(f"gpw/{fol_}/{name}")
        cog_filename = _build_cog_filename(name)

        xds.rio.set_spatial_dims("x", "y", inplace=True)
        xds.rio.write_crs("epsg:4326", inplace=True)

        # The upload must happen inside the `with` block: the temporary file
        # is removed as soon as the block is left.
        with tempfile.NamedTemporaryFile() as temp_file:
            xds.rio.to_raster(temp_file.name, driver="COG")
            s3_client.upload_file(
                Filename=temp_file.name,
                Bucket=bucket_name,
                Key=f"gridded_population_cog/{cog_filename}",
            )

        processed_records.append(
            {"file_name": name, "COGs_created": cog_filename}
        )
        print(f"Generated and saved COG: {cog_filename}")

# A dataframe to keep track of the files that we have transformed into COGs
files_processed = pd.DataFrame(
    processed_records, columns=["file_name", "COGs_created"]
)

# creating the csv file with the names of files transformed.
files_processed.to_csv(
    f"s3://{bucket_name}/gridded_population_cog/files_converted.csv",
)
print("Done generating COGs")