ODIAC Fossil Fuel CO₂ Emissions
Documentation of data transformation
This script was used to transform the ODIAC Fossil Fuel CO₂ Emissions dataset from GeoTIFF to Cloud Optimized GeoTIFF (COG) format for display in the Greenhouse Gas (GHG) Center.
import xarray
import re
import requests
import rioxarray  # registers the .rio accessor used below
import tempfile
import numpy as np
import boto3
import os
import gzip, shutil, wget
import s3fs
import hashlib
import json
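The imports above pull in several third-party packages. A minimal environment sketch follows; the package list is inferred from the imports rather than taken from an official requirements file, and version pins are omitted.

# Assumed environment (inferred from the imports; not an official requirements list)
# pip install xarray rioxarray numpy boto3 s3fs requests wget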
session = boto3.session.Session()
s3_client = session.client("s3")
fs = s3fs.S3FileSystem()

data_dir = "data/"
dataset_name = "odiac-ffco2-monthgrid-v2023"
cog_data_bucket = "ghgc-data-store-develop"
cog_data_prefix = f"transformed_cogs/{dataset_name}"
cog_checksum_prefix = "checksum"
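The session and filesystem objects above pick up AWS credentials from boto3's default sources (environment variables, the shared credentials file, or an attached IAM role). If credentials live in a named profile instead, a sketch of an explicit setup is shown below; the profile name is a placeholder, not something defined by this script.

# Optional: point boto3 and s3fs at a named AWS profile (profile name is a placeholder)
# session = boto3.session.Session(profile_name="ghgc-transform")
# s3_client = session.client("s3")
# fs = s3fs.S3FileSystem(profile="ghgc-transform")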
# Retrieve the checksums of the raw files
checksum_dict = {}
for year in range(2000, 2023):
    checksum_url = f"https://db.cger.nies.go.jp/nies_data/10.17595/20170411.001/odiac2023/1km_tiff/{year}/odiac2023_1km_checksum_{year}.md5.txt"
    response = requests.get(checksum_url)
    content = response.text
    tmp = {}

    # Split the content into lines
    lines = content.splitlines()

    for line in lines:
        checksum, filename = line.split()
        # Strip the ".gz" suffix so the key matches the unzipped .tif name
        tmp[filename[:-3]] = checksum
    checksum_dict.update(tmp)

# Keep only the GeoTIFF entries
checksum_dict = {k: v for k, v in checksum_dict.items() if k.endswith('.tif')}
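As an illustration of the parsing above, each line of the published .md5.txt files is expected to contain an MD5 digest followed by a .tif.gz filename. The line below is a dummy example (the digest is fabricated); it shows how stripping ".gz" makes the key match the unzipped GeoTIFF name.

# Dummy example line (digest is fabricated for illustration only)
example_line = "d41d8cd98f00b204e9800998ecf8427e  odiac2023_1km_excl_intl_0001.tif.gz"
checksum, filename = example_line.split()
print(filename[:-3])  # odiac2023_1km_excl_intl_0001.tif, the key stored in checksum_dict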
def calculate_md5(file_path):
    """
    Calculate the MD5 hash of a file.

    Parameters:
        file_path (str): The path to the file.

    Returns:
        str: The MD5 hash of the file.
    """
    hash_md5 = hashlib.md5()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()
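As a usage sketch, a single downloaded archive can be checked against the published checksum retrieved earlier. The local path below is hypothetical and assumes the download loop in the next block has already populated it.

# Sketch: verify one archive against the published MD5 (path below is hypothetical)
local_gz = "data/2000/odiac2023_1km_excl_intl_0001.tif.gz"
expected = checksum_dict.get("odiac2023_1km_excl_intl_0001.tif")
print(calculate_md5(local_gz) == expected)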
# Code to download raw ODIAC data to your local machine
# Create a base directory for ODIAC data
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

checksum_dict_local = {}

# Download and unzip data for the years you want
for year in range(2000, 2023):
    year_dir = os.path.join(data_dir, str(year))
    checksum_download_link = f"https://db.cger.nies.go.jp/nies_data/10.17595/20170411.001/odiac2023/1km_tiff/{year}/odiac2023_1km_checksum_{year}.md5.txt"

    # Make a subfolder for each year
    if not os.path.exists(year_dir):
        os.makedirs(year_dir)

    # Download the published checksum file into the year folder
    wget.download(checksum_download_link, year_dir)

    for month in range(1, 13):
        month = f"{month:02d}"
        download_link = f"https://db.cger.nies.go.jp/nies_data/10.17595/20170411.001/odiac2023/1km_tiff/{year}/odiac2023_1km_excl_intl_{str(year)[-2:]}{month}.tif.gz"
        target_folder = f"{data_dir}/{year}/"
        fname = os.path.basename(download_link)
        target_path = os.path.join(target_folder, fname)

        # Download the file
        wget.download(download_link, target_path)

        # Unzip the file
        with gzip.open(target_path, 'rb') as f_in:
            with open(target_path[:-3], 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

        # Calculate the checksum of the .gz file
        checksum_dict_local[target_path.split("/")[-1][:-3]] = calculate_md5(target_path)

        # Remove the zip file
        os.remove(target_path)

# Check if the checksums match
checksum_dict_local == checksum_dict
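The bare comparison above only evaluates to True or False in a notebook; an optional sketch that reports individual mismatches instead is:

# Optional: list any files whose local MD5 differs from the published value
for fname, expected in checksum_dict.items():
    local_md5 = checksum_dict_local.get(fname)
    if local_md5 != expected:
        print(f"Checksum mismatch for {fname}: local={local_md5}, published={expected}")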
# List of years you want to run the transformation on
fold_names = [str(i) for i in range(2020, 2023)]

for fol_ in fold_names:
    names = os.listdir(f"{data_dir}{fol_}")
    names = [name for name in names if name.endswith('.tif')]
    print("For year:", fol_)
    for name in names:
        xds = xarray.open_dataarray(f"{data_dir}{fol_}/{name}")

        filename = name.split("/")[-1]
        filename_elements = re.split("[_ .]", filename)

        # Remove the extension
        filename_elements.pop()

        # Extract and insert date of generated COG into filename
        filename_elements[-1] = fol_ + filename_elements[-1][-2:]

        # Replace 0 values with -9999
        xds = xds.where(xds != 0, -9999)
        xds.rio.set_spatial_dims("x", "y", inplace=True)
        xds.rio.write_nodata(-9999, inplace=True)
        xds.rio.write_crs("epsg:4326", inplace=True)

        cog_filename = "_".join(filename_elements)
        cog_filename = f"{cog_filename}.tif"

        # Write the COG file to S3
        with tempfile.NamedTemporaryFile() as temp_file:
            xds.rio.to_raster(
                temp_file.name,
                driver="COG",
                compress="DEFLATE",
            )
            s3_client.upload_file(
                Filename=temp_file.name,
                Bucket=cog_data_bucket,
                Key=f"{cog_data_prefix}/{cog_filename}",
            )

        print(f"Generated and saved COG: {cog_filename}")

print("ODIAC COGs generation completed!!!")
# This block is used to calculate the SHA for each COG file and store it in a JSON.
def get_all_s3_keys(bucket, model_name, ext):
    """Get a list of all keys in an S3 bucket."""
    keys = []

    kwargs = {"Bucket": bucket, "Prefix": f"{model_name}/"}
    while True:
        resp = s3_client.list_objects_v2(**kwargs)
        for obj in resp["Contents"]:
            if obj["Key"].endswith(ext) and "historical" not in obj["Key"]:
                keys.append(obj["Key"])

        try:
            kwargs["ContinuationToken"] = resp["NextContinuationToken"]
        except KeyError:
            break

    return keys

keys = get_all_s3_keys(cog_data_bucket, cog_data_prefix, ".tif")
def compute_sha256(url):
    """Compute the SHA-256 checksum for a given file."""
    sha256_hash = hashlib.sha256()
    with fs.open(url) as f:
        for byte_block in iter(lambda: f.read(4096), b""):
            sha256_hash.update(byte_block)
    return sha256_hash.hexdigest()

sha_mapping = {}
for key in keys:
    sha_mapping[key.split("/")[-1]] = compute_sha256(f"s3://{cog_data_bucket}/{key}")

json_data = json.dumps(sha_mapping, indent=4)
s3_client.put_object(Bucket=cog_data_bucket, Key=f"{cog_checksum_prefix}/{dataset_name}.json", Body=json_data)
print("Checksums created for ODIAC!!!")