import pandas as pd
import glob
import os
import warnings
"ignore", category=RuntimeWarning) warnings.filterwarnings(
Carbon Dioxide and Methane Concentrations from the Los Angeles Megacity Carbon Project
Documentation of data transformation
This script was used to transform the Los Angeles Megacity Carbon Project dataset into meaningful CSV files for ingestion into the vector dataset.
# download data from https://data.nist.gov/od/id/mds2-2388 into your desired_folder
= "CA" source_dir
# Grouping the files for preparation
= pd.read_csv("LAM_sites-2.csv") #metadata from providers
config_ca = glob.glob(f"{source_dir}/*.csv")
all_files= [i.split("/")[-1].split('.')[0] for i in glob.glob(f"{source_dir}/*.csv") ]
all_files ={}
my_dictfor site in list(config_ca.SiteCode):
    # for each site and variable, append into the dict
    if config_ca[config_ca["SiteCode"] == site]["Tower"].item() == 1:
        # Find the files that do not have "upwind" or "all" and have "co2"
        co2_files = [f for f in all_files if site in f and "upwind" not in f and "all" not in f and "co2" in f]
        my_dict[f"{site}-co2"] = co2_files
        # Find the files that do not have "upwind" or "all" and have "ch4"
        ch4_files = [f for f in all_files if site in f and "upwind" not in f and "all" not in f and "ch4" in f]
        my_dict[f"{site}-ch4"] = ch4_files
    else:
        # Find the files that have "upwind" and "co2"
        co2_upwind_files = [f for f in all_files if site in f and "upwind" in f and "co2" in f]
        my_dict[f"{site}-co2"] = co2_upwind_files

        # Find the files that have "upwind" and "ch4"
        ch4_upwind_files = [f for f in all_files if site in f and "upwind" in f and "ch4" in f]
        my_dict[f"{site}-ch4"] = ch4_upwind_files

    # For IRV and RAN, use the combined "all" files instead
    if site in ["IRV", "RAN"]:
        co2_files = [f for f in all_files if site in f and "all" in f and "co2" in f]
        my_dict[f"{site}-co2"] = co2_files
        ch4_files = [f for f in all_files if site in f and "all" in f and "ch4" in f]
        my_dict[f"{site}-ch4"] = ch4_files

# Drop the USC2 entries
del my_dict['USC2-co2']
del my_dict['USC2-ch4']

# Sort the file lists so each site's files are processed in order
for key in my_dict:
    my_dict[key] = sorted(my_dict[key])
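At this point my_dict maps each "<site>-<gas>" key to a sorted list of file basenames. A quick, optional way to inspect the grouping before transforming (the counts depend on the downloaded files):
# Optional: inspect the grouping result (illustrative check)
for key, files in sorted(my_dict.items()):
    print(key, len(files), "file(s)")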
# code to generate transformed data for CA
= "output_LAM"
output_dir =True)
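The loop below builds one hourly-concentration CSV per dictionary key; the output names follow the pattern used in the to_csv call at the end of the loop (the example site/gas is hypothetical):
# Output naming pattern:
#   output_LAM/NIST-testbed-LAM-<SiteCode>-<gas>-hourly-concentrations.csv
#   e.g. output_LAM/NIST-testbed-LAM-USC-co2-hourly-concentrations.csv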
for key, value in my_dict.items():
    df = pd.DataFrame()
    site = key.split("-")[0]       # site code, used to look up the Location metadata
    variable = key.split("-")[-1]  # "co2" or "ch4"
    val = f"{variable}_ppm" if variable == 'co2' else f"{variable}_ppb"
    columns = ["latitude", "longitude", "intake_height_m", "elevation_m", "datetime", val]
    for file in value:
        tmp = pd.read_csv(f"{source_dir}/{file}.csv")
        tmp.dropna(subset=[val], inplace=True)
        tmp.rename(columns={'datetime_UTC': 'datetime'}, inplace=True)
        tmp = tmp[columns]
        tmp.rename(columns={val: 'value'}, inplace=True)
        # Format timestamps as ISO 8601 strings in UTC
        tmp['datetime'] = pd.to_datetime(tmp['datetime'])
        tmp['datetime'] = tmp['datetime'].dt.strftime('%Y-%m-%dT%H:%M:%SZ')
        tmp['location'] = config_ca[config_ca['SiteCode'] == site]["Location"].item()
        df = pd.concat([df, tmp], ignore_index=True)

    # Flag, for each year, the rows measured at that year's maximum intake height
    df['year'] = df['datetime'].apply(lambda x: x[:4])
    result = df.groupby("year").agg(max_height=("intake_height_m", "max"))
    if result['max_height'].std() != 0:
        print(f"More than one max height for {key}", result['max_height'].unique())
    merged_df = pd.merge(df, result, on='year')
    merged_df["is_max_height_data"] = merged_df["max_height"] == merged_df["intake_height_m"]
    merged_df = merged_df.drop(columns=['year', 'max_height'])
    merged_df.reset_index(drop=True, inplace=True)
    merged_df.to_csv(f"{output_dir}/NIST-testbed-LAM-{key}-hourly-concentrations.csv", index=False)
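As an optional last step, one of the transformed files can be read back to confirm the expected schema. This is a sketch: the key "USC-co2" is only an assumed example, so adjust it to any key present in my_dict.
# Optional validation of a transformed file (the key "USC-co2" is an assumed example)
check = pd.read_csv(f"{output_dir}/NIST-testbed-LAM-USC-co2-hourly-concentrations.csv")
expected = {"latitude", "longitude", "intake_height_m", "elevation_m",
            "datetime", "value", "location", "is_max_height_data"}
missing = expected - set(check.columns)
assert not missing, f"Missing columns: {missing}"
print(check.head())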