import pandas as pd
import glob
import os
import warnings
import subprocess
import tarfile
import requests
"ignore", category=RuntimeWarning) warnings.filterwarnings(
Carbon Dioxide and Methane Concentrations from the Northeast Corridor (NEC) Urban Test Bed
Documentation of data transformation
This script was used to transform the Northeast Corridor (NEC) Urban Test Bed dataset into meaningful CSV files for ingestion into the vector dataset.
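Before running the transformation, it can help to preview the site list. A minimal sketch, assuming NEC_sites.csv sits next to this script and provides the SiteCode and Location columns that the code below relies on:

# Preview the site configuration; SiteCode and Location are the two
# columns used later in this script.
print(pd.read_csv("NEC_sites.csv")[["SiteCode", "Location"]].head())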
config = pd.read_csv("NEC_sites.csv")  # https://data.nist.gov/od/id/mds2-3012
# Download each site's archive and extract its contents into the csv/ folder
sites = list(config.SiteCode)
for SiteCode in config.SiteCode:
    print(SiteCode)
    download_link = f"https://data.nist.gov/od/ds/ark:/88434/mds2-3012/{SiteCode}.tgz"
    # Check if the file exists on the server
    response = requests.head(download_link)
    if response.status_code != 404:
        # File exists, proceed with download
        result = subprocess.run(
            ["wget", download_link, "-O", f"{SiteCode}.tgz"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        # Check if wget succeeded
        if result.returncode == 0:
            # Ensure the file is not empty
            if os.path.getsize(f"{SiteCode}.tgz") > 0:
                # Extract the files
                with tarfile.open(f"{SiteCode}.tgz", "r:gz") as tar:
                    tar.extractall()
                # Delete the .tgz file
                os.remove(f"{SiteCode}.tgz")
            else:
                print(f"File {SiteCode}.tgz is empty.")
                sites.remove(SiteCode)
                os.remove(f"{SiteCode}.tgz")  # Remove the empty file
        else:
            print(f"Failed to download {SiteCode}.tgz.")
            sites.remove(SiteCode)
    else:
        print(f"File {SiteCode}.tgz does not exist on the server.")
        sites.remove(SiteCode)
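# Aside (not part of the original pipeline): the loop above shells out to wget,
# which may not be installed on every system. A pure-Python sketch using the
# already-imported requests library would be:
#
#     with requests.get(download_link, stream=True, timeout=60) as r:
#         r.raise_for_status()
#         with open(f"{SiteCode}.tgz", "wb") as fh:
#             for chunk in r.iter_content(chunk_size=1 << 20):
#                 fh.write(chunk)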
sites = list(config.SiteCode)
# These sites are not available on the server
sites.remove('AWS')
sites.remove('BVA')
sites.remove('DNC')
variables = ['ch4', 'co2']
output_dir = "output_NEC"
os.makedirs(output_dir, exist_ok=True)
for site in sites:
    for variable in variables:
        df = pd.DataFrame()
        files = glob.glob(f"csv/{site}-*-{variable}-*.csv")
        # CO2 is reported in ppm, CH4 in ppb
        val = f"{variable}_ppm" if variable == 'co2' else f"{variable}_ppb"
        for file in files:
            tmp = pd.read_csv(file)
            tmp.dropna(subset=[val], inplace=True)
            tmp.rename(columns={'datetime_UTC': 'datetime'}, inplace=True)
            columns = ["latitude", "longitude", "intake_height_m", "elevation_m", "datetime", val]
            tmp = tmp[columns]
            tmp.rename(columns={val: 'value'}, inplace=True)
            # Normalize timestamps to ISO 8601 (UTC)
            tmp['datetime'] = pd.to_datetime(tmp['datetime'])
            tmp['datetime'] = tmp['datetime'].dt.strftime('%Y-%m-%dT%H:%M:%SZ')
            tmp['location'] = config[config['SiteCode'] == site]["Location"].item()
            df = pd.concat([df, tmp], ignore_index=True)
        # Flag the rows measured at each year's maximum intake height
        df['year'] = df['datetime'].apply(lambda x: x[:4])
        result = df.groupby("year").agg(max_height=("intake_height_m", "max"))
        if result['max_height'].nunique() > 1:
            print(f"More than one max height for {site}-{variable}", result['max_height'].unique())
        merged_df = pd.merge(df, result, on='year')
        merged_df["is_max_height_data"] = merged_df["max_height"] == merged_df["intake_height_m"]
        merged_df = merged_df.drop(columns=['year', 'max_height'])
        merged_df.reset_index(drop=True, inplace=True)
        merged_df.to_csv(f"{output_dir}/NIST-testbed-NEC-{site}-{variable}-hourly-concentrations.csv", index=False)