Vulcan Fossil Fuel CO₂ Emissions

Documentation of data transformation & validation
Author

Paridhi Parajuli

Published

August 20, 2024

This script was used to transform the VULCAN dataset, provided in GeoTIFF format, for display in the Greenhouse Gas (GHG) Center, along with the calculation of validation statistics.

import xarray
import pandas as pd
import boto3
import glob
import s3fs
import tempfile
from datetime import datetime
import os
import boto3
from pyproj import CRS
import numpy as np

import rasterio
from rasterio.warp import calculate_default_transform, reproject, Resampling
from rasterio.enums import Resampling
from rio_cogeo.cogeo import cog_translate
from rio_cogeo.profiles import cog_profiles
# Configuration for the source (raw GeoTIFF) and destination (COG) S3 locations.
config = {
    "data_acquisition_method": "s3",
    "raw_data_bucket": "gsfc-ghg-store",
    "raw_data_prefix": "Vulcan/v4.0/grid.1km.mn",
    "cog_data_bucket": "ghgc-data-store-develop",
    "cog_data_prefix": "transformed_cogs/VULCAN_v4",
}

session = boto3.session.Session()
s3_client = session.client("s3")

raw_data_bucket = config["raw_data_bucket"]
raw_data_prefix = config["raw_data_prefix"]

cog_data_bucket = config["cog_data_bucket"]
cog_data_prefix = config["cog_data_prefix"]

# BUG FIX: the original read config["date_fmt"], a key that does not exist in
# `config`, which raised KeyError at import time. The value was never used
# anywhere below, so the lookup is simply removed.

fs = s3fs.S3FileSystem()
def get_all_s3_keys(bucket, model_name, ext):
    """Return all object keys under ``model_name/`` in *bucket* ending with *ext*.

    Keys whose path contains "historical" are skipped. Pagination is followed
    via ContinuationToken so listings larger than one page (1000 objects) are
    fully traversed.
    """
    keys = []

    kwargs = {"Bucket": bucket, "Prefix": f"{model_name}/"}
    while True:
        resp = s3_client.list_objects_v2(**kwargs)
        # BUG FIX: an empty result page has no "Contents" key; .get() avoids
        # the KeyError the original raised on empty/exhausted prefixes.
        for obj in resp.get("Contents", []):
            key = obj["Key"]
            if key.endswith(ext) and "historical" not in key:
                keys.append(key)

        token = resp.get("NextContinuationToken")
        if token is None:
            break
        kwargs["ContinuationToken"] = token

    return keys

# List every raw GeoTIFF under the configured prefix (skips "historical" keys).
keys = get_all_s3_keys(raw_data_bucket, raw_data_prefix, ".tif")
len(keys)
# Accumulator for the overall (all-files) validation stats at each stage.
overall= pd.DataFrame(columns=["data","min","max","mean","std"])
# Step 1: Reproject the data 
# Define the source and target CRS
# Also calculate raw - monthly validation stats
os.makedirs("reproj", exist_ok=True)
# Source CRS of the Vulcan grid: Lambert Conformal Conic (origin 40N, central
# meridian 97W, standard parallels 33N/45N) on the WGS 84 datum.
src_crs = CRS.from_wkt('PROJCS["unknown",GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4326"]],PROJECTION["Lambert_Conformal_Conic_2SP"],PARAMETER["latitude_of_origin",40],PARAMETER["central_meridian",-97],PARAMETER["standard_parallel_1",33],PARAMETER["standard_parallel_2",45],PARAMETER["false_easting",0],PARAMETER["false_northing",0],UNIT["metre",1,AUTHORITY["EPSG","9001"]],AXIS["Easting",EAST],AXIS["Northing",NORTH]]')
dst_crs = CRS.from_epsg(4326)  # WGS 84
# Per-file statistics of the raw (pre-reprojection) data.
df = pd.DataFrame(columns=['filename', 'min(raw)', 'max(raw)', 'mean(raw)', 'std(raw)'])
overall_raw= []
# Reproject each raw granule from Lambert Conformal Conic to WGS 84, recording
# NaN-aware statistics of the raw band along the way.
for key in keys:
    src_url = f"s3://{raw_data_bucket}/{key}"
    with rasterio.open(src_url) as src:
        stem_parts = key.split("/")[-1].split(".")[:-1]
        output_tif = "_".join(stem_parts) + ".tif"
        data = src.read(1)  # first band only
        overall_raw.append(data)

        # Per-file raw statistics (NaN values ignored).
        df.loc[len(df)] = [
            output_tif,
            np.nanmin(data),
            np.nanmax(data),
            np.nanmean(data),
            np.nanstd(data),
        ]

        # Derive the WGS 84 target grid and carry the source metadata over.
        transform, width, height = calculate_default_transform(
            src.crs, dst_crs, src.width, src.height, *src.bounds
        )
        out_meta = src.meta.copy()
        out_meta.update(
            {
                "crs": dst_crs,
                "transform": transform,
                "width": width,
                "height": height,
                "nodata": -9999,
            }
        )

        with rasterio.open(f"reproj/{output_tif}", "w", **out_meta) as dst:
            for band_idx in range(1, src.count + 1):
                reproject(
                    source=rasterio.band(src, band_idx),
                    destination=rasterio.band(dst, band_idx),
                    src_transform=src.transform,
                    src_crs=src.crs,
                    dst_transform=transform,
                    dst_crs=dst_crs,
                    resampling=Resampling.nearest,
                )
        print(f"Done for {output_tif}")

# Overall validation of the raw data: fold every granule into one stats row.
overall_raw = np.array(overall_raw)
overall.loc[len(overall)] = [
    "raw",
    np.nanmin(overall_raw),
    np.nanmax(overall_raw),
    np.nanmean(overall_raw),
    np.nanstd(overall_raw),
]
# Validation of the reprojected data: per-file (yearly) statistics.
overall_reproj = []
files = glob.glob("reproj/*.tif")
df1 = pd.DataFrame(columns=['filename', 'min(reprojected)', 'max(reprojected)', 'mean(reprojected)', 'std(reprojected)'])
for file in files:
    with rasterio.open(file) as src:
        output_tif = "_".join(file.split("/")[-1].split(".")[:-1]) + ".tif"
        data = src.read(1)
        # Mask the -9999 nodata fill introduced by the reprojection step.
        data = np.ma.masked_equal(data, -9999)
        overall_reproj.append(data)

        # NaN-aware statistics on the masked band.
        df1.loc[len(df1)] = [
            output_tif,
            np.nanmin(data),
            np.nanmax(data),
            np.nanmean(data),
            np.nanstd(data),
        ]
# Overall validation of the reprojected data (all granules combined).
overall_reproj = np.ma.masked_equal(np.array(overall_reproj), -9999)
overall.loc[len(overall)] = [
    "reprojected",
    np.nanmin(overall_reproj),
    np.nanmax(overall_reproj),
    np.nanmean(overall_reproj),
    np.nanstd(overall_reproj),
]
# Step 2: Scale the data by 44/12 (C -> CO2; undone at validation time via
# 12/44) and replace NaN and 0 values with the -9999 nodata fill.
os.makedirs("reproj2", exist_ok=True)
files = glob.glob("reproj/*.tif")
for file in files:
    filename = file.split('/')[-1]
    xda = xarray.open_dataarray(file).sel(band=1)

    # BUG FIX: the original multiplied a stale `data` variable left over from a
    # previous loop and then immediately overwrote the result with
    # `xda.where(...)`, so the 44/12 scaling was never actually applied.
    # Scale the freshly loaded array instead (zeros stay zero, so the
    # zero-replacement below is unaffected by the ordering).
    scaled = xda * (44 / 12)

    scaled = scaled.where(xda != 0, -9999)  # Replace 0 with -9999
    scaled = scaled.fillna(-9999)  # Ensure all NaNs are replaced with -9999
    data_array = scaled.values

    # Open the source raster to reuse its metadata, switching to a float32 COG
    # with an explicit -9999 nodata value.
    with rasterio.open(file) as src:
        meta = src.meta
        meta.update({
            'nodata': -9999,
            'dtype': 'float32',
            'driver': 'COG'
        })
        # BUG FIX: the original wrote every granule to the same literal path
        # ("reproj2/(unknown)"), overwriting all but the last file; write one
        # output per input filename instead.
        with rasterio.open(f"reproj2/{filename}", 'w', **meta) as dst:
            dst.write(data_array, 1)
# Validation of the reprojected data with zeros excluded — per-file stats.
overall_reproj2 = []
files = glob.glob("reproj/*.tif")
df11 = pd.DataFrame(columns=['filename', 'min(reproj_nonzero)', 'max(reproj_nonzero)', 'mean(reproj_nonzero)', 'std(reproj_nonzero)'])
for file in files:
    with rasterio.open(file) as src:
        output_tif = "_".join(file.split("/")[-1].split(".")[:-1]) + ".tif"
        data = src.read(1)
        # Mask both the -9999 nodata fill and exact zeros before the stats.
        data = np.ma.masked_where((data == -9999) | (data == 0), data)
        overall_reproj2.append(data)

        # NaN-aware statistics on the masked band.
        min_val = np.nanmin(data)
        max_val = np.nanmax(data)
        mean_val = np.nanmean(data)
        std_val = np.nanstd(data)
        df11.loc[len(df11)] = [output_tif, min_val, max_val, mean_val, std_val]
# Validation of the reprojected non-zero data — overall (all files combined).
overall_reproj2 = np.array(overall_reproj2)
overall_reproj2 = np.ma.masked_where(
    (overall_reproj2 == -9999) | (overall_reproj2 == 0), overall_reproj2
)
overall.loc[len(overall)] = [
    "reprojected_non_zero",
    np.nanmin(overall_reproj2),
    np.nanmax(overall_reproj2),
    np.nanmean(overall_reproj2),
    np.nanstd(overall_reproj2),
]
# Step 3: Add overviews and write the final COGs to the local "output" dir.
COG_PROFILE = {"driver": "COG", "compress": "DEFLATE"}
OVERVIEW_LEVELS = 9
OVERVIEW_RESAMPLING = 'average'

# BUG FIX: the output directory was never created, so os.rename below failed
# with FileNotFoundError on a fresh checkout.
os.makedirs("output", exist_ok=True)

for file in glob.glob("reproj2/*.tif"):
    # BUG FIX: the original nested double quotes inside a double-quoted
    # f-string (f"output/{file.split("/")[-1]}"), which is a SyntaxError on
    # Python < 3.12; os.path.basename is also portable across separators.
    output_path = f"output/{os.path.basename(file)}"

    # Create a temporary file to hold the COG
    with tempfile.NamedTemporaryFile(suffix='.tif', delete=False) as temp_file:
        # Create COG with overviews and nodata value
        cog_translate(
            file,
            temp_file.name,
            cog_profiles.get("deflate"),
            overview_level=OVERVIEW_LEVELS,
            overview_resampling=OVERVIEW_RESAMPLING,
            nodata=-9999
        )
        # Move the temporary file to the desired local path
        os.rename(temp_file.name, output_path)
# Validation of the final COGs (with overviews) — per-file statistics.
overall_final = []
files = glob.glob("output/*.tif")
df2 = pd.DataFrame(columns=['filename', 'min(transformed)', 'max(transformed)', 'mean(transformed)', 'std(transformed)'])
for file in files:
    with rasterio.open(file) as src:
        filename_elements = file.split("/")[-1].split(".")[:-1]
        output_tif = "_".join(filename_elements) + ".tif"
        data = src.read(1)  # Read the first band

        # Mask -9999 values and NaNs for statistics calculation
        data = np.ma.masked_where((data == -9999) | np.isnan(data), data)
        # Undo the 44/12 unit scaling applied during transformation so these
        # statistics are directly comparable to the raw-data statistics.
        data = data * (12 / 44)
        overall_final.append(data)

        # Calculate statistics while ignoring NaN values
        min_val = np.nanmin(data)
        max_val = np.nanmax(data)
        mean_val = np.nanmean(data)
        # BUG FIX: the original computed np.nansum into an unused `total` and
        # then stored a stale `std_val` from an earlier loop; compute the
        # standard deviation of THIS file's data instead.
        std_val = np.nanstd(data)
        df2.loc[len(df2)] = [output_tif, min_val, max_val, mean_val, std_val]
# Validation of the final (with-overviews) data — overall calculation.
overall_final = np.array(overall_final)
overall_final = np.ma.masked_where(
    (overall_final == -9999) | np.isnan(overall_final), overall_final
)
overall.loc[len(overall)] = [
    "Transformed",
    np.nanmin(overall_final),
    np.nanmax(overall_final),
    np.nanmean(overall_final),
    np.nanstd(overall_final),
]
# Join all four per-file stats tables on filename (computed once, reused for
# both display and persistence) and save everything to JSON.
yearly_stats = pd.merge(
    pd.merge(df, df1, on='filename', how='inner'),
    pd.merge(df11, df2, on='filename', how='inner'),
    how='inner',
    on='filename',
)
yearly_stats
overall
# Save to json
overall.to_json("overall_stats.json")
yearly_stats.to_json("yearly_stats.json")
Back to top