import os
import xarray
import re
import pandas as pd
import json
import tempfile
import boto3
import rasterio
from datetime import datetime
from dateutil.relativedelta import relativedelta
# OCO-2 MIP Top-Down CO₂ Budgets
# Documentation of data transformation
# This script was used to transform the OCO-2 MIP Top-Down CO₂ Budgets dataset
# from netCDF to Cloud Optimized GeoTIFF (COG) format for display in the
# Greenhouse Gas (GHG) Center.
# Convert every netCDF file found in ./new_data into one COG per
# (time step, variable) pair, upload each COG to S3, and finally write a CSV
# listing which COGs were produced from which input file.
session = boto3.session.Session()
s3_client = session.client("s3")
bucket_name = "ghgc-data-store-dev"  # S3 bucket where the COGs are to be stored
year_ = datetime(2015, 1, 1)  # Initialize the starting date time of the dataset.

COG_PROFILE = {"driver": "COG", "compress": "DEFLATE"}

# Reading the raw netCDF files from local machine
files_processed = pd.DataFrame(
    columns=["file_name", "COGs_created"]
)  # A dataframe to keep track of the files that are converted into COGs
for name in os.listdir("new_data"):
    ds = xarray.open_dataset(
        f"new_data/{name}",
        engine="netcdf4",
    )
    ds = ds.rename({"latitude": "lat", "longitude": "lon"})
    # assign coords from dimensions:
    # wrap longitudes into [-180, 180) and sort them ascending
    ds = ds.assign_coords(lon=(((ds.lon + 180) % 360) - 180)).sortby("lon")
    ds = ds.assign_coords(lat=list(ds.lat))

    variable = [var for var in ds.data_vars]

    for time_increment in range(0, len(ds.year)):
        # The first two data variables are skipped.
        # NOTE(review): presumably they are not gridded fields - confirm
        # against the input files.
        for var in variable[2:]:
            # NOTE(review): the separator is "/ " (slash + space); os.listdir
            # returns bare names, so this split is effectively a no-op.
            filename = name.split("/ ")[-1]
            filename_elements = re.split("[_ .]", filename)
            try:
                # Label-based lookup using the loop index as the label.
                # NOTE(review): if the "year" coordinate holds calendar years
                # rather than 0..N-1, this presumably raises KeyError and the
                # fallback below handles every variable - confirm.
                data = ds[var].sel(year=time_increment)
                date = year_ + relativedelta(years=+time_increment)
                # Replace the trailing element (the file extension) with the year
                filename_elements[-1] = date.strftime("%Y")
                # # insert date of generated COG into filename
                filename_elements.insert(2, var)
                cog_filename = "_".join(filename_elements)
                # # add extension
                cog_filename = f"{cog_filename}.tif"
            except KeyError:
                # Fallback: use the whole variable and label the file with the
                # first and last year of the record instead of a single year.
                data = ds[var]
                date = year_ + relativedelta(years=+(len(ds.year) - 1))
                filename_elements.pop()
                filename_elements.append(year_.strftime("%Y"))
                filename_elements.append(date.strftime("%Y"))
                filename_elements.insert(2, var)
                cog_filename = "_".join(filename_elements)
                # # add extension
                cog_filename = f"{cog_filename}.tif"

            # Reverse the latitude order before writing the raster
            data = data.reindex(lat=list(reversed(data.lat)))

            # NOTE(review): the `.rio` accessor comes from rioxarray, which is
            # not imported in the visible code - confirm it gets registered.
            data.rio.set_spatial_dims("lon", "lat")
            data.rio.write_crs("epsg:4326", inplace=True)

            # generate COG
            COG_PROFILE = {"driver": "COG", "compress": "DEFLATE"}
            with tempfile.NamedTemporaryFile() as temp_file:
                data.rio.to_raster(temp_file.name, **COG_PROFILE)
                s3_client.upload_file(
                    Filename=temp_file.name,
                    Bucket=bucket_name,
                    Key=f"ceos_co2_flux/{cog_filename}",
                )
            # NOTE(review): `_append` is a private pandas method; kept as in
            # the original script.
            files_processed = files_processed._append(
                {"file_name": name, "COGs_created": cog_filename},
                ignore_index=True,
            )

            print(f"Generated and saved COG: {cog_filename}")

# creating the csv file with the names of files transformed.
files_processed.to_csv(
    f"s3://{bucket_name}/ceos_co2_flux/files_converted.csv",
)
print("Done generating COGs")