Carbon Dioxide and Methane Concentrations from the Northeast Corridor (NEC) Urban Test Bed

Documentation of data transformation

This script was used to transform the Northeast Corridor (NEC) Urban Test Bed dataset into meaningful CSV files for ingestion into the vector dataset.

import pandas as pd
import glob
import os
import subprocess
import tarfile
import warnings
import requests

warnings.filterwarnings("ignore", category=RuntimeWarning)
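The rest of the script relies only on the SiteCode and Location columns of NEC_sites.csv; a minimal pre-flight check of that assumption (a sketch, not part of the original pipeline):

config = pd.read_csv("NEC_sites.csv")
# The download loop uses SiteCode; the per-site loop later looks up Location.
assert {"SiteCode", "Location"} <= set(config.columns), "NEC_sites.csv is missing expected columns"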
# Site list: https://data.nist.gov/od/id/mds2-3012
config = pd.read_csv("NEC_sites.csv")
sites = list(config.SiteCode)

# Code to download the files into the csv folder
for SiteCode in config.SiteCode:
    print(SiteCode)
    download_link = f"https://data.nist.gov/od/ds/ark:/88434/mds2-3012/{SiteCode}.tgz"
    # Check if the file exists on the server
    response = requests.head(download_link)
    if response.status_code != 404:
        # File exists, proceed with download
        result = subprocess.run(
            ["wget", download_link, "-O", f"{SiteCode}.tgz"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        # Check if wget succeeded
        if result.returncode == 0:
            # Ensure the file is not empty
            if os.path.getsize(f"{SiteCode}.tgz") > 0:
                # Extract the files
                with tarfile.open(f"{SiteCode}.tgz", "r:gz") as tar:
                    tar.extractall()
                # Delete the .tgz file
                os.remove(f"{SiteCode}.tgz")
            else:
                print(f"File {SiteCode}.tgz is empty.")
                sites.remove(SiteCode)
                os.remove(f"{SiteCode}.tgz")  # Remove the empty file
        else:
            print(f"Failed to download {SiteCode}.tgz.")
            sites.remove(SiteCode)
    else:
        print(f"File {SiteCode}.tgz does not exist on the server.")
        sites.remove(SiteCode)
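If wget is not available on the system, the same download step can be written with requests alone; a minimal sketch under that assumption, using the same URL pattern (download_site_archive is a hypothetical helper, not part of the original script):

def download_site_archive(site_code):
    # Hypothetical pure-requests replacement for the wget subprocess call above.
    url = f"https://data.nist.gov/od/ds/ark:/88434/mds2-3012/{site_code}.tgz"
    response = requests.get(url, stream=True)
    if response.status_code != 200:
        return False
    with open(f"{site_code}.tgz", "wb") as f:
        # Stream the archive to disk in 1 MiB chunks.
        for chunk in response.iter_content(chunk_size=1 << 20):
            f.write(chunk)
    return True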
sites = list(config.SiteCode)
# These sites are not available on the server
sites.remove('AWS')
sites.remove('BVA')
sites.remove('DNC')
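An equivalent, slightly more defensive formulation builds the list in one pass, so a site code that is already absent does not raise ValueError (a sketch with the same outcome as the removes above):

unavailable = {'AWS', 'BVA', 'DNC'}
sites = [s for s in config.SiteCode if s not in unavailable]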
variables = ['ch4', 'co2']
output_dir = "output_NEC"
os.makedirs(output_dir, exist_ok=True)

for site in sites:
    for variable in variables:
        df = pd.DataFrame()
        files = glob.glob(f"csv/{site}-*-{variable}-*.csv")
        # co2 is reported in ppm, ch4 in ppb
        val = f"{variable}_ppm" if variable == 'co2' else f"{variable}_ppb"
        for file in files:
            tmp = pd.read_csv(file)
            tmp.dropna(subset=[val], inplace=True)
            tmp.rename(columns={'datetime_UTC': 'datetime'}, inplace=True)
            columns = ["latitude", "longitude", "intake_height_m", "elevation_m", "datetime", val]
            tmp = tmp[columns]
            tmp.rename(columns={val: 'value'}, inplace=True)
            tmp['datetime'] = pd.to_datetime(tmp['datetime'])
            tmp['datetime'] = tmp['datetime'].dt.strftime('%Y-%m-%dT%H:%M:%SZ')
            tmp['location'] = config[config['SiteCode'] == site]["Location"].item()
            df = pd.concat([df, tmp], ignore_index=True)
        # Flag, per year, the rows measured at that year's maximum intake height
        df['year'] = df['datetime'].apply(lambda x: x[:4])
        result = df.groupby("year").agg(max_height=("intake_height_m", "max"))
        if result['max_height'].nunique() > 1:
            print(f"More than one max height for {site}-{variable}", result['max_height'].unique())
        merged_df = pd.merge(df, result, on='year')
        merged_df["is_max_height_data"] = merged_df["max_height"] == merged_df["intake_height_m"]
        merged_df = merged_df.drop(columns=['year', 'max_height'])
        merged_df.reset_index(drop=True, inplace=True)
        merged_df.to_csv(f"{output_dir}/NIST-testbed-NEC-{site}-{variable}-hourly-concentrations.csv", index=False)
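A quick way to sanity-check one of the generated files; the file name follows the pattern used above and the expected columns mirror the transformation (a verification sketch, not part of the original script):

out = pd.read_csv(f"{output_dir}/NIST-testbed-NEC-{sites[0]}-co2-hourly-concentrations.csv")
expected = {"latitude", "longitude", "intake_height_m", "elevation_m",
            "datetime", "value", "location", "is_max_height_data"}
assert expected <= set(out.columns)
print(len(out), out['datetime'].min(), out['datetime'].max())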