import pandas as pd
import glob
import os
import warnings
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)Carbon Dioxide and Methane Concentrations from the Los Angeles Megacity Carbon Project
Documentation of data transformation
This script was used to transform the the Los Angeles Megacity Carbon Project dataset into meaningful csv files for ingestion to vector dataset.
# Download data from https://data.nist.gov/od/id/mds2-2388 into source_dir.
source_dir = "CA"  # folder holding the raw per-site CSV files

# Site metadata from the data providers (SiteCode, Tower flag, Location, ...).
config_ca = pd.read_csv("LAM_sites-2.csv")

# Bare file names of every raw CSV — directory and ".csv" extension stripped,
# e.g. "CA/USC_co2_hourly.csv" -> "USC_co2_hourly".  The original globbed the
# directory twice and split on "/" (non-portable); one glob with
# os.path.basename/splitext is equivalent and works on any platform.
all_files = [
    os.path.splitext(os.path.basename(path))[0]
    for path in glob.glob(os.path.join(source_dir, "*.csv"))
]
# Group raw file basenames by site and gas: "SITE-co2"/"SITE-ch4" -> [files].
my_dict = {}
for site in list(config_ca.SiteCode):
    # Tower flag from the provider metadata decides which file family to use.
    is_tower = config_ca[config_ca["SiteCode"] == site]["Tower"].item() == 1
    for gas in ("co2", "ch4"):
        if is_tower:
            # Tower sites: plain hourly files (neither "upwind" nor "all").
            selected = [
                f for f in all_files
                if site in f and gas in f
                and "upwind" not in f and "all" not in f
            ]
        else:
            # Non-tower sites: only the "upwind" files.
            selected = [
                f for f in all_files
                if site in f and gas in f and "upwind" in f
            ]
        if site in ["IRV", "RAN"]:
            # IRV and RAN override both cases with their combined "all" files.
            selected = [
                f for f in all_files
                if site in f and gas in f and "all" in f
            ]
        my_dict[f"{site}-{gas}"] = selected

# USC2 is excluded from the transformed output.
del my_dict['USC2-co2']
del my_dict['USC2-ch4']

# Deterministic ordering of the files within each site/gas group.
for key in my_dict:
    my_dict[key] = sorted(my_dict[key])
# Code to generate the transformed CA data: one hourly-concentrations CSV
# per site/gas group in my_dict.
output_dir = "output_LAM"
os.makedirs(output_dir, exist_ok=True)

for key, value in my_dict.items():
    df = pd.DataFrame()
    # key has the form "SITE-gas"; the original looked up `site`, the stale
    # loop variable from the grouping loop above, which stamped EVERY row
    # with the last site's Location — derive the site from the key instead.
    site_code = key.split("-")[0]
    variable = key.split("-")[-1]
    # co2 is reported in ppm, ch4 in ppb.
    val = f"{variable}_ppm" if variable == 'co2' else f"{variable}_ppb"
    columns = ["latitude", "longitude", "intake_height_m", "elevation_m", "datetime", val]
    for file in value:
        tmp = pd.read_csv(f"CA/{file}.csv")
        # Drop rows with no measurement for this gas.
        tmp.dropna(subset=[val], inplace=True)
        tmp.rename(columns={'datetime_UTC': 'datetime'}, inplace=True)
        tmp = tmp[columns]
        tmp.rename(columns={val: 'value'}, inplace=True)
        # Normalize timestamps to ISO-8601 UTC ("...Z") strings.
        tmp['datetime'] = pd.to_datetime(tmp['datetime'])
        tmp['datetime'] = tmp['datetime'].dt.strftime('%Y-%m-%dT%H:%M:%SZ')
        tmp['location'] = config_ca[config_ca['SiteCode'] == site_code]["Location"].item()
        df = pd.concat([df, tmp], ignore_index=True)

    # Flag rows measured at the tallest intake height within each year.
    df['year'] = df['datetime'].apply(lambda x: x[:4])
    result = df.groupby("year").agg(max_height=("intake_height_m", "max"))
    # nunique() > 1 replaces std() != 0: std() of a single-year group is NaN,
    # and NaN != 0 is True, so the original warned spuriously for sites with
    # only one year of data.
    if result['max_height'].nunique() > 1:
        print(f"More than one max height for {key}", result['max_height'].unique())
    merged_df = pd.merge(df, result, on='year')
    merged_df["is_max_height_data"] = merged_df["max_height"] == merged_df["intake_height_m"]
    merged_df = merged_df.drop(columns=['year', 'max_height'])
    merged_df.reset_index(drop=True, inplace=True)
    merged_df.to_csv(f"{output_dir}/NIST-testbed-LAM-{key}-hourly-concentrations.csv", index=False)