import glob
import os
import re
import shutil
import tempfile
from datetime import datetime

import boto3
import numpy as np
import pandas as pd
import rasterio
import s3fs
import xarray
from pyproj import CRS
from rasterio.enums import Resampling
from rasterio.io import MemoryFile
from rasterio.warp import calculate_default_transform, reproject, Resampling
from rio_cogeo.cogeo import cog_translate
from rio_cogeo.profiles import cog_profiles

# Vulcan Fossil Fuel CO₂ Emissions v4.0
# Documentation of data transformation & validation
# Updated on: June 17, 2025
# This script was used to transform the Vulcan dataset into Cloud Optimized
# GeoTIFF (COG) format for display in the Greenhouse Gas (GHG) Center, with
# the calculation of validation statistics.
# Configuration for this transformation run: where the raw Vulcan GeoTIFFs
# live on S3 and where the transformed COGs are intended to go.
config = {
    "data_acquisition_method": "s3",
    "raw_data_bucket": "gsfc-ghg-store",
    "raw_data_prefix": "Vulcan/v4.0/grid.1km.mn",
    "cog_data_bucket": "ghgc-data-store-develop",
    "cog_data_prefix": "transformed_cogs/VULCAN_v4",
    "date_fmt": "%Y",
    "transformation": {},
}

# Shared AWS session/client used by the S3 listing calls below.
session = boto3.session.Session()
s3_client = session.client("s3")

raw_data_bucket = config["raw_data_bucket"]
raw_data_prefix = config["raw_data_prefix"]
cog_data_bucket = config["cog_data_bucket"]
cog_data_prefix = config["cog_data_prefix"]
date_fmt = config["date_fmt"]
# s3fs filesystem handle (kept for interactive inspection of the bucket).
fs = s3fs.S3FileSystem()


def get_all_s3_keys(bucket, model_name, ext):
    """Return all object keys in *bucket* under ``model_name/`` ending in *ext*.

    Pages through ``list_objects_v2`` results using the continuation token.
    Keys containing "historical" are skipped. Returns an empty list when the
    prefix matches no objects (previously this raised ``KeyError`` because
    "Contents" is absent from an empty listing response).
    """
    keys = []
    kwargs = {"Bucket": bucket, "Prefix": f"{model_name}/"}
    while True:
        resp = s3_client.list_objects_v2(**kwargs)
        # "Contents" is omitted entirely when the page holds no objects.
        for obj in resp.get("Contents", []):
            key = obj["Key"]
            if key.endswith(ext) and "historical" not in key:
                keys.append(key)
        # NextContinuationToken is only present while results are truncated.
        if "NextContinuationToken" not in resp:
            break
        kwargs["ContinuationToken"] = resp["NextContinuationToken"]
    return keys
keys = get_all_s3_keys(raw_data_bucket, raw_data_prefix, ".tif")
# Omit files that are not required; the auxiliary products have longer keys.
keys = [k for k in keys if len(k) < 72]

# Accumulator for the overall (all-files) validation statistics of each
# processing stage: raw, reprojected, reprojected non-zero, transformed.
overall = pd.DataFrame(columns=["data", "min", "max", "mean", "std"])
# Step 1: Reproject each raw GeoTIFF from its Lambert Conformal Conic grid
# to WGS 84, writing results to reproj/, and record per-file validation
# statistics on the raw data as we go.
os.makedirs("reproj", exist_ok=True)

# Source CRS of the raw Vulcan grids (Lambert Conformal Conic over CONUS).
# NOTE(review): kept for documentation; the reprojection below reads src.crs
# from each file directly rather than using this constant.
src_crs = CRS.from_wkt('PROJCS["unknown",GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4326"]],PROJECTION["Lambert_Conformal_Conic_2SP"],PARAMETER["latitude_of_origin",40],PARAMETER["central_meridian",-97],PARAMETER["standard_parallel_1",33],PARAMETER["standard_parallel_2",45],PARAMETER["false_easting",0],PARAMETER["false_northing",0],UNIT["metre",1,AUTHORITY["EPSG","9001"]],AXIS["Easting",EAST],AXIS["Northing",NORTH]]')
dst_crs = CRS.from_epsg(4326)  # WGS 84

df = pd.DataFrame(columns=['filename', 'min(raw)', 'max(raw)', 'mean(raw)', 'std(raw)'])
overall_raw = []
for key in keys:
    url = f"s3://{raw_data_bucket}/{key}"
    with rasterio.open(url) as src:
        # e.g. "Vulcan/v4.0/.../name.2016.tif" -> "name_2016.tif"
        filename_elements = key.split("/")[-1].split(".")[:-1]
        output_tif = "_".join(filename_elements) + ".tif"

        data = src.read(1)  # read the first band
        overall_raw.append(data)
        # Per-file statistics, ignoring NaN values.
        df.loc[len(df)] = [
            output_tif,
            np.nanmin(data),
            np.nanmax(data),
            np.nanmean(data),
            np.nanstd(data),
        ]

        # Compute the output grid for the WGS 84 reprojection.
        transform, width, height = calculate_default_transform(
            src.crs, dst_crs, src.width, src.height, *src.bounds)
        kwargs = src.meta.copy()
        kwargs.update({
            'crs': dst_crs,
            'transform': transform,
            'width': width,
            'height': height,
            'nodata': -9999,
        })
        with rasterio.open(f"reproj/{output_tif}", 'w', **kwargs) as dst:
            for i in range(1, src.count + 1):
                reproject(
                    source=rasterio.band(src, i),
                    destination=rasterio.band(dst, i),
                    src_transform=src.transform,
                    src_crs=src.crs,
                    dst_transform=transform,
                    dst_crs=dst_crs,
                    resampling=Resampling.nearest)
        print(f"Done for {output_tif}")

# Overall validation stats across all raw files (stacking assumes every
# Vulcan grid shares the same shape).
overall_raw = np.array(overall_raw)
overall.loc[len(overall)] = [
    "raw",
    np.nanmin(overall_raw),
    np.nanmax(overall_raw),
    np.nanmean(overall_raw),
    np.nanstd(overall_raw),
]
# Validation statistics for the reprojected data, per file.
overall_reproj = []
files = glob.glob("reproj/*.tif")
df1 = pd.DataFrame(columns=['filename', 'min(reprojected)', 'max(reprojected)', 'mean(reprojected)', 'std(reprojected)'])
for file in files:
    with rasterio.open(file) as src:
        filename_elements = os.path.basename(file).split(".")[:-1]
        output_tif = "_".join(filename_elements) + ".tif"
        data = src.read(1)
        # Mask the -9999 nodata cells so they do not skew the statistics.
        data = np.ma.masked_equal(data, -9999)
        overall_reproj.append(data)
        df1.loc[len(df1)] = [
            output_tif,
            np.nanmin(data),
            np.nanmax(data),
            np.nanmean(data),
            np.nanstd(data),
        ]

# Overall validation of reprojected data. np.array() over a list of masked
# arrays silently drops the masks, so -9999 is re-masked after stacking
# (safe here because the underlying values are unchanged).
overall_reproj = np.array(overall_reproj)
overall_reproj = np.ma.masked_equal(overall_reproj, -9999)
overall.loc[len(overall)] = [
    "reprojected",
    np.nanmin(overall_reproj),
    np.nanmax(overall_reproj),
    np.nanmean(overall_reproj),
    np.nanstd(overall_reproj),
]
# Step 2: Scale values by 44/12 (presumably converting mass of carbon to
# mass of CO2 — confirm with the data provider), replace 0 and NaN with the
# -9999 nodata value, and write each result to reproj2/ as a COG.
os.makedirs("reproj2", exist_ok=True)
files = glob.glob("reproj/*.tif")
for file in files:
    filename = os.path.basename(file)
    xda = xarray.open_dataarray(file).sel(band=1)
    # Scale everything except the nodata cells.
    xda = xda.where(xda == -9999, xda * (44 / 12))
    data = xda.where(xda != 0, -9999)  # zeros become nodata
    data = data.fillna(-9999)          # remaining NaNs become nodata
    data_array = data.values
    # Reuse the source raster's metadata, updated for COG output.
    with rasterio.open(file) as src:
        meta = src.meta
        meta.update({
            'nodata': -9999,
            'dtype': 'float32',
            'driver': 'COG'
        })
        # BUG FIX: the output path was previously a hard-coded literal, so
        # every iteration overwrote the same file; write per-file names.
        with rasterio.open(f"reproj2/{filename}", 'w', **meta) as dst:
            dst.write(data_array, 1)
# Validation statistics for the reprojected data with zeros excluded, per
# file. Zeros are excluded because Step 2 converts them to nodata, so these
# stats are the reprojected counterpart of the final (transformed) stats.
overall_reproj2 = []
files = glob.glob("reproj/*.tif")
df11 = pd.DataFrame(columns=['filename', 'min(reproj_nonzero)', 'max(reproj_nonzero)', 'mean(reproj_nonzero)', 'std(reproj_nonzero)'])
for file in files:
    with rasterio.open(file) as src:
        filename_elements = os.path.basename(file).split(".")[:-1]
        output_tif = "_".join(filename_elements) + ".tif"
        data = src.read(1)
        # Mask both nodata and zero cells for the statistics.
        data = np.ma.masked_where((data == -9999) | (data == 0), data)
        overall_reproj2.append(data)
        df11.loc[len(df11)] = [
            output_tif,
            np.nanmin(data),
            np.nanmax(data),
            np.nanmean(data),
            np.nanstd(data),
        ]

# Overall validation (non-zero reprojected). Re-mask after stacking because
# np.array() over masked arrays drops the masks (values are unchanged, so
# re-masking on the same conditions is safe).
overall_reproj2 = np.array(overall_reproj2)
overall_reproj2 = np.ma.masked_where((overall_reproj2 == -9999) | (overall_reproj2 == 0), overall_reproj2)
overall.loc[len(overall)] = [
    "reprojected_non_zero",
    np.nanmin(overall_reproj2),
    np.nanmax(overall_reproj2),
    np.nanmean(overall_reproj2),
    np.nanstd(overall_reproj2),
]
# Step 3: Add overviews and write the final COGs to output/.
COG_PROFILE = {"driver": "COG", "compress": "DEFLATE"}
OVERVIEW_LEVELS = 9
OVERVIEW_RESAMPLING = 'average'

# BUG FIX: the destination directory was never created, so the move below
# failed on a fresh checkout.
os.makedirs("output", exist_ok=True)
for file in glob.glob("reproj2/*.tif"):
    # (os.path.basename also avoids the nested-quote f-string that was a
    # syntax error before Python 3.12.)
    output_path = f"output/{os.path.basename(file)}"
    # Build the COG in a temporary file, then move it into place.
    with tempfile.NamedTemporaryFile(suffix='.tif', delete=False) as temp_file:
        # Create the COG with overviews and the -9999 nodata value.
        cog_translate(
            file,
            temp_file.name,
            cog_profiles.get("deflate"),
            overview_level=OVERVIEW_LEVELS,
            overview_resampling=OVERVIEW_RESAMPLING,
            nodata=-9999
        )
        # shutil.move works across filesystems, unlike os.rename (the temp
        # dir is often on a different device than the working directory).
        shutil.move(temp_file.name, output_path)
# Validation statistics for the final COGs (with overviews), per file.
overall_final = []
files = glob.glob("output/*.tif")
df2 = pd.DataFrame(columns=['filename', 'min(transformed)', 'max(transformed)', 'mean(transformed)', 'std(transformed)'])
for file in files:
    with rasterio.open(file) as src:
        filename_elements = os.path.basename(file).split(".")[:-1]
        output_tif = "_".join(filename_elements) + ".tif"
        data = src.read(1)  # read the first band
        # Mask -9999 nodata values and NaNs before computing statistics.
        data = np.ma.masked_where((data == -9999) | np.isnan(data), data)
        # Undo the 44/12 scaling from Step 2 so these stats are directly
        # comparable with the raw / reprojected-non-zero stats.
        data = data * (12 / 44)
        overall_final.append(data)
        df2.loc[len(df2)] = [
            output_tif,
            np.nanmin(data),
            np.nanmax(data),
            np.nanmean(data),
            np.nanstd(data),
        ]

# Overall validation of the final data.
# BUG FIX: np.array() over a list of masked arrays silently drops the masks,
# and after the 12/44 rescaling the values under the masks are no longer
# -9999, so re-masking on -9999 let nodata cells pollute the statistics.
# np.ma.stack preserves the per-file masks.
overall_final = np.ma.stack(overall_final)
overall.loc[len(overall)] = [
    "Transformed",
    np.nanmin(overall_final),
    np.nanmax(overall_final),
    np.nanmean(overall_final),
    np.nanstd(overall_final),
]

# Persist the validation statistics for documentation.
overall.to_json("overall_stats.json")
pd.merge(
    pd.merge(df, df1, on='filename', how='inner'),
    pd.merge(df11, df2, on='filename', how='inner'),
    on='filename', how='inner',
).to_json("yearly_stats.json")