Carbon Dioxide and Methane Concentrations from the Northeast Corridor (NEC) Urban Test Bed

Documentation of data transformation

This script was used to transform the Northeast Corridor (NEC) Urban Test Bed dataset into meaningful CSV files for ingestion into the vector dataset.

import pandas as pd
import glob
import os
import subprocess
import tarfile
import warnings
import requests

warnings.filterwarnings("ignore", category=RuntimeWarning)
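The rest of the script relies only on the SiteCode and Location columns of NEC_sites.csv; a minimal pre-flight check of that assumption (a sketch, not part of the original pipeline):

config = pd.read_csv("NEC_sites.csv")
# The download loop uses SiteCode; the per-site loop later looks up Location.
assert {"SiteCode", "Location"} <= set(config.columns), "NEC_sites.csv is missing expected columns"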
# Site list: https://data.nist.gov/od/id/mds2-3012
config = pd.read_csv("NEC_sites.csv")
sites = list(config.SiteCode)

# Code to download the files into the csv folder
for SiteCode in config.SiteCode:
    print(SiteCode)
    download_link = f"https://data.nist.gov/od/ds/ark:/88434/mds2-3012/{SiteCode}.tgz"
    # Check if the file exists on the server
    response = requests.head(download_link)
    if response.status_code != 404:
        # File exists, proceed with download
        result = subprocess.run(
            ["wget", download_link, "-O", f"{SiteCode}.tgz"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        # Check if wget succeeded
        if result.returncode == 0:
            # Ensure the file is not empty
            if os.path.getsize(f"{SiteCode}.tgz") > 0:
                # Extract the files
                with tarfile.open(f"{SiteCode}.tgz", "r:gz") as tar:
                    tar.extractall()
                # Delete the .tgz file
                os.remove(f"{SiteCode}.tgz")
            else:
                print(f"File {SiteCode}.tgz is empty.")
                sites.remove(SiteCode)
                os.remove(f"{SiteCode}.tgz")  # Remove the empty file
        else:
            print(f"Failed to download {SiteCode}.tgz.")
            sites.remove(SiteCode)
    else:
        print(f"File {SiteCode}.tgz does not exist on the server.")
        sites.remove(SiteCode)
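If wget is not available on the system, the same download step can be written with requests alone; a minimal sketch under that assumption, using the same URL pattern (download_site_archive is a hypothetical helper, not part of the original script):

def download_site_archive(site_code):
    # Hypothetical pure-requests replacement for the wget subprocess call above.
    url = f"https://data.nist.gov/od/ds/ark:/88434/mds2-3012/{site_code}.tgz"
    response = requests.get(url, stream=True)
    if response.status_code != 200:
        return False
    with open(f"{site_code}.tgz", "wb") as f:
        # Stream the archive to disk in 1 MiB chunks.
        for chunk in response.iter_content(chunk_size=1 << 20):
            f.write(chunk)
    return True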
sites = list(config.SiteCode)
# These sites are not available on the server
sites.remove('AWS')
sites.remove('BVA')
sites.remove('DNC')
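An equivalent, slightly more defensive formulation builds the list in one pass, so a site code that is already absent does not raise ValueError (a sketch with the same outcome as the removes above):

unavailable = {'AWS', 'BVA', 'DNC'}
sites = [s for s in config.SiteCode if s not in unavailable]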
variables = ['ch4', 'co2']
output_dir = "output_NEC"
os.makedirs(output_dir, exist_ok=True)

for site in sites:
    for variable in variables:
        df = pd.DataFrame()
        files = glob.glob(f"csv/{site}-*-{variable}-*.csv")
        # co2 is reported in ppm, ch4 in ppb
        val = f"{variable}_ppm" if variable == 'co2' else f"{variable}_ppb"
        for file in files:
            tmp = pd.read_csv(file)
            tmp.dropna(subset=[val], inplace=True)
            tmp.rename(columns={'datetime_UTC': 'datetime'}, inplace=True)
            columns = ["latitude", "longitude", "intake_height_m", "elevation_m", "datetime", val]
            tmp = tmp[columns]
            tmp.rename(columns={val: 'value'}, inplace=True)
            tmp['datetime'] = pd.to_datetime(tmp['datetime'])
            tmp['datetime'] = tmp['datetime'].dt.strftime('%Y-%m-%dT%H:%M:%SZ')
            tmp['location'] = config[config['SiteCode'] == site]["Location"].item()
            df = pd.concat([df, tmp], ignore_index=True)
        # Flag, per year, the rows measured at that year's maximum intake height
        df['year'] = df['datetime'].apply(lambda x: x[:4])
        result = df.groupby("year").agg(max_height=("intake_height_m", "max"))
        if result['max_height'].nunique() > 1:
            print(f"More than one max height for {site}-{variable}", result['max_height'].unique())
        merged_df = pd.merge(df, result, on='year')
        merged_df["is_max_height_data"] = merged_df["max_height"] == merged_df["intake_height_m"]
        merged_df = merged_df.drop(columns=['year', 'max_height'])
        merged_df.reset_index(drop=True, inplace=True)
        merged_df.to_csv(f"{output_dir}/NIST-testbed-NEC-{site}-{variable}-hourly-concentrations.csv", index=False)
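A quick way to sanity-check one of the generated files; the file name follows the pattern used above and the expected columns mirror the transformation (a verification sketch, not part of the original script):

out = pd.read_csv(f"{output_dir}/NIST-testbed-NEC-{sites[0]}-co2-hourly-concentrations.csv")
expected = {"latitude", "longitude", "intake_height_m", "elevation_m",
            "datetime", "value", "location", "is_max_height_data"}
assert expected <= set(out.columns)
print(len(out), out['datetime'].min(), out['datetime'].max())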