Carbon Dioxide and Methane Concentrations from the Los Angeles Megacity Carbon Project

Documentation of data transformation
Author

Paridhi Parajuli

Published

September 19, 2024

This script was used to transform the Los Angeles Megacity Carbon Project dataset into meaningful CSV files for ingestion into the vector dataset.

import pandas as pd
import glob
import os
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)
# Download data from https://data.nist.gov/od/id/mds2-2388 into the source directory below
source_dir = "CA"
# Grouping the files for preparation
config_ca = pd.read_csv("LAM_sites-2.csv") #metadata from providers
# Collect base filenames (without extension) for all CSVs in the source directory
all_files = [os.path.basename(f).split('.')[0] for f in glob.glob(f"{source_dir}/*.csv")]
my_dict={}
for site in list(config_ca.SiteCode):
    # for each site and variable, append into the dict
    if (config_ca[config_ca["SiteCode"]==site]["Tower"].item()) ==1 :

        co2_files = [f for f in all_files if site in f and "upwind" not in f and "all" not in f and "co2" in f]
        my_dict[f"{site}-co2"] = co2_files
        # Find the files that do not have "upwind" or "all" and have "ch4"
        ch4_files = [f for f in all_files if site in f and "upwind" not in f and "all" not in f and "ch4" in f]
        my_dict[f"{site}-ch4"] = ch4_files
    else:
        co2_upwind_files = [f for f in all_files if site in f and "upwind" in f and "co2" in f]
        my_dict[f"{site}-co2"] = co2_upwind_files
        
        # Find the files that have "upwind" and "ch4"
        ch4_upwind_files = [f for f in all_files if site in f and "upwind" in f and "ch4" in f]
        my_dict[f"{site}-ch4"] = ch4_upwind_files

        if site in ["IRV","RAN"]:
            co2_files = [f for f in all_files if site in f and "all" in f and "co2" in f]
            my_dict[f"{site}-co2"] = co2_files
            ch4_files = [f for f in all_files if site in f and "all" in f and "ch4" in f]
            my_dict[f"{site}-ch4"] = ch4_files
        
del my_dict['USC2-co2']
del my_dict['USC2-ch4']

for key in my_dict:
    my_dict[key] = sorted(my_dict[key])
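As an optional sanity check (not part of the original pipeline), the grouped keys and their file counts can be printed before transforming the data:

# Optional sanity check: list each site-variable key and how many files were grouped under it
for key, files in my_dict.items():
    print(f"{key}: {len(files)} file(s)")
    if not files:
        print(f"  WARNING: no files matched for {key}")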
# code to generate transformed data for CA
output_dir = "output_LAM"
os.makedirs(output_dir,exist_ok=True)
for key, value in my_dict.items():
    df=pd.DataFrame()
    site = key.split("-")[0]       # site code for this key
    variable = key.split("-")[-1]  # "co2" or "ch4"
    val = f"{variable}_ppm" if variable == 'co2' else f"{variable}_ppb"
    columns = ["latitude","longitude","intake_height_m","elevation_m","datetime",val ]
    for file in value:
        tmp = pd.read_csv(f"CA/{file}.csv")
        tmp.dropna(subset=[val], inplace=True)
        tmp.rename(columns={'datetime_UTC': 'datetime'}, inplace=True)
        tmp= tmp[columns]
        tmp.rename(columns={val: 'value'}, inplace=True)
        tmp['datetime'] = pd.to_datetime(tmp['datetime'])
        tmp['datetime'] = tmp['datetime'].dt.strftime('%Y-%m-%dT%H:%M:%SZ')
        tmp['location'] = config_ca[config_ca['SiteCode']==site]["Location"].item()
        df = pd.concat([df, tmp], ignore_index=True)
        
    df['year']= df['datetime'].apply(lambda x: x[:4])
    result = df.groupby("year").agg(max_height= ("intake_height_m","max"))
    if result['max_height'].nunique() > 1:
        print(f"More than one max height for {key}", result['max_height'].unique())
    merged_df=pd.merge(df, result, on='year')
    merged_df["is_max_height_data"]= merged_df["max_height"] == merged_df["intake_height_m"]
    merged_df=merged_df.drop(columns=['year','max_height'])
    merged_df.reset_index(drop=True, inplace=True)
    merged_df.to_csv(f"{output_dir}/NIST-testbed-LAM-{key}-hourly-concentrations.csv", index=False)
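As an optional final check (a minimal sketch, assuming the outputs were written to output_LAM as above), the generated files can be reloaded to confirm the expected columns are present:

# Optional verification: load a few transformed files and confirm the expected columns exist
expected_cols = {"latitude", "longitude", "intake_height_m", "elevation_m",
                 "datetime", "value", "location", "is_max_height_data"}
for path in sorted(glob.glob(f"{output_dir}/*.csv"))[:3]:
    out = pd.read_csv(path)
    missing = expected_cols - set(out.columns)
    print(os.path.basename(path), len(out), "rows", "| missing columns:", missing or "none")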
    