import glob
import os
import shutil
import tempfile
from datetime import datetime

import boto3
import numpy as np
import pandas as pd
import rasterio
import s3fs
import xarray
from pyproj import CRS
from rasterio.warp import calculate_default_transform, reproject, Resampling
from rasterio.enums import Resampling
from rio_cogeo.cogeo import cog_translate
from rio_cogeo.profiles import cog_profiles
# Vulcan Fossil Fuel CO₂ Emissions
# Documentation of data transformation & validation.
# This script was used to transform the VULCAN dataset, provided in GeoTIFF
# format, for display in the Greenhouse Gas (GHG) Center, with the
# calculation of validation statistics.
# Pipeline configuration: where the raw GeoTIFFs live on S3 and where the
# transformed COGs should be written.
config = {
    "data_acquisition_method": "s3",
    "raw_data_bucket": "gsfc-ghg-store",
    "raw_data_prefix": "Vulcan/v4.0/grid.1km.mn",
    "cog_data_bucket": "ghgc-data-store-develop",
    "cog_data_prefix": "transformed_cogs/VULCAN_v4",
}
# AWS session and S3 client used for listing and reading the raw rasters.
session = boto3.session.Session()
s3_client = session.client("s3")

# Unpack configuration into module-level names used throughout the script.
raw_data_bucket = config["raw_data_bucket"]
raw_data_prefix = config["raw_data_prefix"]
cog_data_bucket = config["cog_data_bucket"]
cog_data_prefix = config["cog_data_prefix"]
# BUGFIX: 'date_fmt' is not a key of `config` above, so config['date_fmt']
# raised KeyError. Use .get() so the script can run; the value is unused in
# the visible code. TODO confirm the intended date format string.
date_fmt = config.get("date_fmt")

# Filesystem-style interface to S3 (lets rasterio open s3:// URLs).
fs = s3fs.S3FileSystem()
def get_all_s3_keys(bucket, model_name, ext):
    """Return every key under ``model_name/`` in *bucket* that ends in *ext*.

    Keys containing the substring "historical" are skipped. Pagination is
    driven by ListObjectsV2 continuation tokens until the listing is
    exhausted.

    Parameters:
        bucket: S3 bucket name.
        model_name: key prefix (no trailing slash; one is appended).
        ext: required filename suffix, e.g. ".tif".
    """
    keys = []
    kwargs = {"Bucket": bucket, "Prefix": f"{model_name}/"}
    while True:
        resp = s3_client.list_objects_v2(**kwargs)
        # BUGFIX: an empty listing has no "Contents" key at all; the
        # original resp["Contents"] raised KeyError in that case.
        for obj in resp.get("Contents", []):
            key = obj["Key"]
            if key.endswith(ext) and "historical" not in key:
                keys.append(key)
        try:
            kwargs["ContinuationToken"] = resp["NextContinuationToken"]
        except KeyError:
            # No continuation token: last page reached.
            break
    return keys
# Gather every raw GeoTIFF key under the configured prefix.
keys = get_all_s3_keys(raw_data_bucket, raw_data_prefix, ".tif")
len(keys)

# One summary row per pipeline stage will be appended to this frame.
overall = pd.DataFrame(columns=["data", "min", "max", "mean", "std"])
# Step 1: reproject every raw GeoTIFF from the provider's Lambert Conformal
# Conic grid to WGS 84, collecting per-file (monthly) raw statistics on the
# way through.
os.makedirs("reproj", exist_ok=True)

# Source CRS as delivered; target is plain geographic WGS 84.
src_crs = CRS.from_wkt('PROJCS["unknown",GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4326"]],PROJECTION["Lambert_Conformal_Conic_2SP"],PARAMETER["latitude_of_origin",40],PARAMETER["central_meridian",-97],PARAMETER["standard_parallel_1",33],PARAMETER["standard_parallel_2",45],PARAMETER["false_easting",0],PARAMETER["false_northing",0],UNIT["metre",1,AUTHORITY["EPSG","9001"]],AXIS["Easting",EAST],AXIS["Northing",NORTH]]')
dst_crs = CRS.from_epsg(4326)  # WGS 84

df = pd.DataFrame(columns=['filename', 'min(raw)', 'max(raw)', 'mean(raw)', 'std(raw)'])
overall_raw = []

for key in keys:
    url = f"s3://{raw_data_bucket}/{key}"
    with rasterio.open(url) as src:
        # Output name: basename with all extension dots collapsed to "_".
        name_parts = key.split("/")[-1].split(".")[:-1]
        output_tif = "_".join(name_parts) + ".tif"

        data = src.read(1)  # first band
        overall_raw.append(data)

        # Per-file raw statistics (NaN-aware).
        df.loc[len(df)] = [
            output_tif,
            np.nanmin(data),
            np.nanmax(data),
            np.nanmean(data),
            np.nanstd(data),
        ]

        # Compute the target grid for the reprojection.
        transform, width, height = calculate_default_transform(
            src.crs, dst_crs, src.width, src.height, *src.bounds
        )
        out_meta = src.meta.copy()
        out_meta.update(
            crs=dst_crs,
            transform=transform,
            width=width,
            height=height,
            nodata=-9999,
        )

        with rasterio.open(f"reproj/{output_tif}", 'w', **out_meta) as dst:
            for band in range(1, src.count + 1):
                reproject(
                    source=rasterio.band(src, band),
                    destination=rasterio.band(dst, band),
                    src_transform=src.transform,
                    src_crs=src.crs,
                    dst_transform=transform,
                    dst_crs=dst_crs,
                    resampling=Resampling.nearest,
                )
    print(f"Done for {output_tif}")

# Overall validation of the raw data (all files pooled).
overall_raw = np.array(overall_raw)
nan_min = np.nanmin(overall_raw)
nan_max = np.nanmax(overall_raw)
nan_mean = np.nanmean(overall_raw)
nan_std = np.nanstd(overall_raw)
overall.loc[len(overall)] = ["raw", nan_min, nan_max, nan_mean, nan_std]
# Validation of reprojected data - per-file (yearly) calculation.
overall_reproj = []
files = glob.glob("reproj/*.tif")
df1 = pd.DataFrame(columns=['filename', 'min(reprojected)', 'max(reprojected)', 'mean(reprojected)', 'std(reprojected)'])

for file in files:
    with rasterio.open(file) as src:
        name_parts = file.split("/")[-1].split(".")[:-1]
        output_tif = "_".join(name_parts) + ".tif"

        band1 = src.read(1)
        # Hide the -9999 nodata fill before computing statistics.
        band1 = np.ma.masked_equal(band1, -9999)
        overall_reproj.append(band1)

        # NaN-aware per-file statistics.
        df1.loc[len(df1)] = [
            output_tif,
            np.nanmin(band1),
            np.nanmax(band1),
            np.nanmean(band1),
            np.nanstd(band1),
        ]
# Overall validation of the reprojected data (all files pooled).
overall_reproj = np.array(overall_reproj)
overall_reproj = np.ma.masked_equal(overall_reproj, -9999)
nan_min, nan_max, nan_mean, nan_std = (
    fn(overall_reproj) for fn in (np.nanmin, np.nanmax, np.nanmean, np.nanstd)
)
overall.loc[len(overall)] = ["reprojected", nan_min, nan_max, nan_mean, nan_std]
# Step 2: scale the data and replace NaN and 0 values with -9999.
os.makedirs("reproj2", exist_ok=True)
files = glob.glob("reproj/*.tif")
for file in files:
    filename = file.split('/')[-1]
    xda = xarray.open_dataarray(file).sel(band=1)

    # Scale by 44/12 (presumably the C -> CO2 molecular-weight conversion;
    # the final validation step divides by 12/44 to undo it — confirm with
    # the data provider).
    # BUGFIX: the original multiplied an undefined `data` and then threw the
    # result away by re-deriving `data` from the unscaled `xda`; scale first,
    # then apply the nodata replacements to the scaled array.
    data = xda * (44 / 12)
    data = data.where(data != 0, -9999)  # replace 0 with -9999
    data = data.fillna(-9999)            # ensure all NaNs become -9999
    data_array = data.values

    # Open the source raster only to copy its metadata.
    with rasterio.open(file) as src:
        meta = src.meta
    meta.update({
        'nodata': -9999,
        'dtype': 'float32',
        'driver': 'COG',
    })
    # BUGFIX: the output path literal was mangled ("reproj2/(unknown)");
    # write each result under its source filename.
    with rasterio.open(f"reproj2/{filename}", 'w', **meta) as dst:
        dst.write(data_array, 1)
# Validation of reprojected data with zeros excluded - per-file (monthly)
# calculation. Reads the pre-scaling "reproj" rasters; zeros and the -9999
# fill are masked out here rather than on disk.
overall_reproj2 = []
files = glob.glob("reproj/*.tif")
df11 = pd.DataFrame(columns=['filename', 'min(reproj_nonzero)', 'max(reproj_nonzero)', 'mean(reproj_nonzero)', 'std(reproj_nonzero)'])

for file in files:
    with rasterio.open(file) as src:
        name_parts = file.split("/")[-1].split(".")[:-1]
        output_tif = "_".join(name_parts) + ".tif"

        arr = src.read(1)
        # Exclude both the nodata fill and true zeros from the statistics.
        arr = np.ma.masked_where((arr == -9999) | (arr == 0), arr)
        overall_reproj2.append(arr)

        df11.loc[len(df11)] = [
            output_tif,
            np.nanmin(arr),
            np.nanmax(arr),
            np.nanmean(arr),
            np.nanstd(arr),
        ]
# Validation of reprojected non-zero data - overall (pooled) calculation.
overall_reproj2 = np.array(overall_reproj2)
overall_reproj2 = np.ma.masked_where((overall_reproj2 == -9999) | (overall_reproj2 == 0), overall_reproj2)
nan_min, nan_max, nan_mean, nan_std = (
    fn(overall_reproj2) for fn in (np.nanmin, np.nanmax, np.nanmean, np.nanstd)
)
overall.loc[len(overall)] = ["reprojected_non_zero", nan_min, nan_max, nan_mean, nan_std]
# Step 3: build the final Cloud-Optimized GeoTIFFs with overviews.
COG_PROFILE = {"driver": "COG", "compress": "DEFLATE"}
OVERVIEW_LEVELS = 9
OVERVIEW_RESAMPLING = 'average'

# BUGFIX: "output/" was never created, so the move below would fail on a
# fresh run.
os.makedirs("output", exist_ok=True)

for file in glob.glob("reproj2/*.tif"):
    # BUGFIX: the original f-string nested double quotes inside double
    # quotes — a SyntaxError before Python 3.12.
    output_path = f"output/{file.split('/')[-1]}"

    # Create a temporary file to hold the COG.
    with tempfile.NamedTemporaryFile(suffix='.tif', delete=False) as temp_file:
        # Create COG with overviews and the -9999 nodata value.
        cog_translate(
            file,
            temp_file.name,
            cog_profiles.get("deflate"),
            overview_level=OVERVIEW_LEVELS,
            overview_resampling=OVERVIEW_RESAMPLING,
            nodata=-9999,
        )
    # BUGFIX: os.rename fails across filesystems (temp dir vs. working
    # dir); shutil.move handles that case.
    shutil.move(temp_file.name, output_path)
# Validation of the final data (with overviews) - per-file calculation.
overall_final = []
files = glob.glob("output/*.tif")
df2 = pd.DataFrame(columns=['filename', 'min(transformed)', 'max(transformed)', 'mean(transformed)', 'std(transformed)'])
for file in files:
    with rasterio.open(file) as src:
        name_parts = file.split("/")[-1].split(".")[:-1]
        output_tif = "_".join(name_parts) + ".tif"

        data = src.read(1)  # first band
        # Mask -9999 values and NaNs for the statistics calculation.
        data = np.ma.masked_where((data == -9999) | np.isnan(data), data)
        # Undo the 44/12 scaling applied during transformation so these
        # numbers are directly comparable with the raw statistics.
        data = data * (12 / 44)
        overall_final.append(data)

        min_val = np.nanmin(data)
        max_val = np.nanmax(data)
        mean_val = np.nanmean(data)
        # BUGFIX: the original never computed std_val in this loop (it
        # computed an unused nansum instead), so the "std(transformed)"
        # column was stale or raised NameError.
        std_val = np.nanstd(data)
        df2.loc[len(df2)] = [output_tif, min_val, max_val, mean_val, std_val]
# Validation of the final data (with overviews) - overall (pooled)
# calculation.
overall_final = np.array(overall_final)
overall_final = np.ma.masked_where((overall_final == -9999) | np.isnan(overall_final), overall_final)
nan_min, nan_max, nan_mean, nan_std = (
    fn(overall_final) for fn in (np.nanmin, np.nanmax, np.nanmean, np.nanstd)
)
overall.loc[len(overall)] = ["Transformed", nan_min, nan_max, nan_mean, nan_std]
# Join the per-file statistics from every stage into one comparison table
# (inner-joined on filename), then persist both summaries as JSON.
yearly_stats = pd.merge(
    pd.merge(df, df1, on='filename', how='inner'),
    pd.merge(df11, df2, on='filename', how='inner'),
    how='inner',
    on='filename',
)
yearly_stats
overall

# Save to json
overall.to_json("overall_stats.json")
yearly_stats.to_json("yearly_stats.json")