GOSAT-based Top-down Methane Budgets

Documentation of data transformation

This script was used to transform the GOSAT-based Top-down Methane Budgets dataset from netCDF to Cloud Optimized GeoTIFF (COG) format for display in the Greenhouse Gas (GHG) Center.

import os
import xarray
import re
import pandas as pd
import json
import tempfile
import boto3
import rasterio
import rioxarray  # registers the .rio accessor used on xarray objects below
from datetime import datetime
from dateutil.relativedelta import relativedelta
session = boto3.session.Session()
s3_client = session.client("s3")
bucket_name = "ghgc-data-store-dev"  # S3 bucket where the COGs are stored after transformation
year_ = datetime(2019, 1, 1)
folder_name = "new_data/CH4-inverse-flux"

COG_PROFILE = {"driver": "COG", "compress": "DEFLATE"}

files_processed = pd.DataFrame(
    columns=["file_name", "COGs_created"]
)  # A dataframe to keep track of the files that we have transformed into COGs

# Reading the raw netCDF files from local machine
for name in os.listdir(folder_name):
    ds = xarray.open_dataset(
        f"{folder_name}/{name}",
        engine="netcdf4",
    )

    ds = ds.rename({"dimy": "lat", "dimx": "lon"})
    # assign coords from dimensions
    ds = ds.assign_coords(lon=(((ds.lon + 180) % 360) - 180)).sortby("lon")
    ds = ds.assign_coords(lat=((ds.lat / 180) * 180) - 90).sortby("lat")
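    # The modulo arithmetic above recentres 0-360 longitudes onto the
    # -180..180 range: e.g. a cell at lon 190 maps to
    # ((190 + 180) % 360) - 180 = -170, i.e. 170 degrees West.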

    variable = [var for var in ds.data_vars]

    # convert every data variable except the first two to a COG
    for var in variable[2:]:
        filename = name.split("/")[-1]
        filename_elements = re.split("[_ .]", filename)
        data = ds[var]

        filename_elements.pop()  # drop the extension token
        filename_elements.insert(2, var)
        cog_filename = "_".join(filename_elements)
        # add extension
        cog_filename = f"{cog_filename}.tif"
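        # For a hypothetical input named "gosat_ch4_2019_v1.nc", re.split
        # yields ["gosat", "ch4", "2019", "v1", "nc"]; pop() drops "nc",
        # insert(2, var) places the variable name, and the result is
        # "gosat_ch4_<var>_2019_v1.tif".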

        data = data.reindex(lat=list(reversed(data.lat)))  # north-up row order for the GeoTIFF

        data.rio.set_spatial_dims("lon", "lat", inplace=True)
        data.rio.write_crs("epsg:4326", inplace=True)

        # generate COG
        with tempfile.NamedTemporaryFile() as temp_file:
            data.rio.to_raster(temp_file.name, **COG_PROFILE)
            s3_client.upload_file(
                Filename=temp_file.name,
                Bucket=bucket_name,
                Key=f"ch4_inverse_flux/{cog_filename}",
            )

            files_processed = pd.concat(
                [files_processed, pd.DataFrame([{"file_name": name, "COGs_created": cog_filename}])],
                ignore_index=True,
            )

            print(f"Generated and saved COG: {cog_filename}")

# Generate the json file with the metadata that is present in the netCDF files.
with tempfile.NamedTemporaryFile(mode="w+") as fp:
    # Merge all metadata into a single object so the uploaded file is valid JSON.
    json.dump(
        {
            **ds.attrs,
            "data_dimensions": dict(ds.dims),
            "data_variables": list(ds.data_vars),
        },
        fp,
    )
    fp.flush()

    s3_client.upload_file(
        Filename=fp.name,
        Bucket=bucket_name,
        Key="ch4_inverse_flux/metadata.json",
    )

# creating the csv file with the names of files transformed (pandas needs
# s3fs installed to write directly to an s3:// URL).
files_processed.to_csv(
    f"s3://{bucket_name}/ch4_inverse_flux/files_converted.csv",
)
print("Done generating COGs")
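
As a quick spot check, one of the generated COGs can be pulled back down and opened with rasterio to confirm that the georeferencing and compression survived the transformation. This is a minimal sketch rather than part of the original workflow; the object key below is hypothetical and reuses the upload prefix from the script.

s3_client.download_file(
    Bucket=bucket_name,
    Key="ch4_inverse_flux/example_cog.tif",  # hypothetical key; use a name printed above
    Filename="check.tif",
)
with rasterio.open("check.tif") as src:
    print(src.crs)           # expected: EPSG:4326
    print(src.compression)   # expected: Compression.deflate
    print(src.overviews(1))  # internal overview factors (may be empty for small grids)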