Skip to content

Add test file for TimeBinner obsfunction#553

Open
ReubenHill wants to merge 5 commits intodevelopfrom
feature/timebinner
Open

Add test file for TimeBinner obsfunction#553
ReubenHill wants to merge 5 commits intodevelopfrom
feature/timebinner

Conversation

@ReubenHill
Copy link
Copy Markdown
Contributor

@ReubenHill ReubenHill commented Mar 25, 2026

Description

Test file for https://github.com/JCSDA-internal/ufo/pull/4048 . The file was generated with the below python script:

import netCDF4 as nc
import numpy as np
from datetime import datetime as dt, timezone as tz


MISSING_FLOAT = -3.3687952621450501176e38  # jedi's missing float value
MISSING_STR = "MISSING*"  # jedi's missing string value
# missing int_64 is lowest possible int_64 value + 7
MISSING_INT64 = -(2**63) + 7
MISSING_INT32 = -(2**31) + 5
MISSING_DATETIME = MISSING_INT64  # How it's ultimately represented in outputs


def time_binner_test_files_gen(
    name,
    lat,
    lon,
    datetime,
    station_id,
    about_hour_minus_30m_timestamps,
    about_hour_minus_30m_bin_numbers,
    excluding_hour_plus_1hour_timestamps,
    excluding_hour_plus_1hour_bin_numbers,
    about_hour_minus_30m_plus_29m59s_timestamps,
    about_hour_minus_30m_plus_29m59s_bin_numbers,
    reverse_about_hour_minus_30m_plus_29m59s_bin_numbers,
    reverse_about_hour_minus_30m_bin_numbers,
    reverse_about_hour_plus_30m_timestamps,
    reverse_about_hour_plus_30m_bin_numbers,
    about_day_minus_12h_timestamps,
    about_day_minus_12h_bin_numbers,
    about_minute_minus_30s_plus_10s_timestamps,
    about_minute_minus_30s_plus_10s_bin_numbers,
):
    """
    Generate a NetCDF4 file for testing the TimeBinner UFO ObsFunction.

    Parameters:
        name (str): Name of the output NetCDF4 file.
        lat (np.ndarray): Array of latitudes.
        lon (np.ndarray): Array of longitudes.
        datetime (np.ndarray): Array of observation datetimes (seconds since epoch).
        station_id (np.ndarray): Array of station identifications.
        about_hour_minus_30m_timestamps (np.ndarray): Binned datetimes for "about hour minus 30 minutes" test.
        about_hour_minus_30m_bin_numbers (np.ndarray): Bin numbers for "about hour minus 30 minutes" test.
        excluding_hour_plus_1hour_timestamps (np.ndarray): Binned datetimes for "excluding hour plus 1 hour" test.
        excluding_hour_plus_1hour_bin_numbers (np.ndarray): Bin numbers for "excluding hour plus 1 hour" test.
        about_hour_minus_30m_plus_29m59s_timestamps (np.ndarray): Binned datetimes for "about hour minus 30 minutes plus 29m59s" test.
        about_hour_minus_30m_plus_29m59s_bin_numbers (np.ndarray): Bin numbers for "about hour minus 30 minutes plus 29m59s" test.
        reverse_about_hour_minus_30m_plus_29m59s_bin_numbers (np.ndarray): Bin numbers for reverse chronological "about hour minus 30 minutes plus 29m59s" test.
        reverse_about_hour_minus_30m_bin_numbers (np.ndarray): Bin numbers for reverse chronological "about hour minus 30 minutes" test.
        reverse_about_hour_plus_30m_timestamps (np.ndarray): Binned datetimes for reverse chronological "about hour plus 30 minutes" test.
        reverse_about_hour_plus_30m_bin_numbers (np.ndarray): Bin numbers for reverse chronological "about hour plus 30 minutes" test.
        about_day_minus_12h_timestamps (np.ndarray): Binned datetimes for "about day minus 12 hours" test.
        about_day_minus_12h_bin_numbers (np.ndarray): Bin numbers for "about day minus 12 hours" test.
        about_minute_minus_30s_plus_10s_timestamps (np.ndarray): Binned datetimes for "about minute minus 30 seconds plus 10 seconds" test.
        about_minute_minus_30s_plus_10s_bin_numbers (np.ndarray): Bin numbers for "about minute minus 30 seconds plus 10 seconds" test.
    """
    file = nc.Dataset(name, "w", format="NETCDF4")

    # Dimensions
    location_dim = file.createDimension("Location", len(datetime))

    # Variables
    lat_var = file.createVariable("MetaData/latitude", "f4", ("Location"), fill_value=MISSING_FLOAT)
    lat_var[:] = lat
    lon_var = file.createVariable("MetaData/longitude", "f4", ("Location"), fill_value=MISSING_FLOAT)
    lon_var[:] = lon
    datetime_var = file.createVariable("MetaData/dateTime", "i8", ("Location"), fill_value=MISSING_DATETIME)
    datetime_var.units = "seconds since 1970-01-01T00:00:00Z"
    datetime_var[:] = datetime
    station_id_var = file.createVariable(
        "MetaData/stationIdentification", str, ("Location")
    )
    station_id_var[:] = station_id

    # Add some temperatures in case they need to be there for the file to be read
    temp_var = file.createVariable("ObsValue/airTemperature", "f4", ("Location"), fill_value=MISSING_FLOAT)
    temp_var[:] = 280.0 + 10.0 * np.sin(np.linspace(0, 10 * np.pi, len(datetime)))

    about_hour_minus_30m_timestamps_var = file.createVariable(
        "TestReference/aboutHourMinus30MTimestamps", "i8", ("Location"), fill_value=MISSING_DATETIME
    )
    about_hour_minus_30m_timestamps_var[:] = about_hour_minus_30m_timestamps
    about_hour_minus_30m_bin_numbers_var = file.createVariable(
        "TestReference/aboutHourMinus30MBinNumbers", "i8", ("Location"), fill_value=MISSING_INT64
    )
    about_hour_minus_30m_bin_numbers_var[:] = about_hour_minus_30m_bin_numbers
    excluding_hour_plus_1hour_timestamps_var = file.createVariable(
        "TestReference/excludingHourPlus1HourBinTimestamps", "i8", ("Location"), fill_value=MISSING_DATETIME
    )
    excluding_hour_plus_1hour_timestamps_var[:] = excluding_hour_plus_1hour_timestamps
    excluding_hour_plus_1hour_bin_numbers_var = file.createVariable(
        "TestReference/excludingHourPlus1HourBinNumbers", "i8", ("Location"), fill_value=MISSING_INT64
    )
    excluding_hour_plus_1hour_bin_numbers_var[:] = excluding_hour_plus_1hour_bin_numbers
    about_hour_minus_30m_plus_29m59s_timestamps_var = file.createVariable(
        "TestReference/aboutHourMinus30MPlus29M59STimestamps", "i8", ("Location"), fill_value=MISSING_DATETIME
    )
    about_hour_minus_30m_plus_29m59s_timestamps_var[:] = (
        about_hour_minus_30m_plus_29m59s_timestamps
    )
    about_hour_minus_30m_plus_29m59s_bin_numbers_var = file.createVariable(
        "TestReference/aboutHourMinus30MPlus29M59SBinNumbers", "i8", ("Location"), fill_value=MISSING_INT64
    )
    about_hour_minus_30m_plus_29m59s_bin_numbers_var[:] = (
        about_hour_minus_30m_plus_29m59s_bin_numbers
    )
    reverse_about_hour_minus_30m_plus_29m59s_bin_numbers_var = file.createVariable(
        "TestReference/reverseAboutHourMinus30MPlus29M59SBinNumbers", "i8", ("Location"), fill_value=MISSING_INT64
    )
    reverse_about_hour_minus_30m_plus_29m59s_bin_numbers_var[:] = (
        reverse_about_hour_minus_30m_plus_29m59s_bin_numbers
    )
    reverse_about_hour_minus_30m_bin_numbers_var = file.createVariable(
        "TestReference/reverseAboutHourMinus30MBinNumbers", "i8", ("Location"), fill_value=MISSING_INT64
    )
    reverse_about_hour_minus_30m_bin_numbers_var[:] = (
        reverse_about_hour_minus_30m_bin_numbers
    )
    reverse_about_hour_plus_30m_timestamps_var = file.createVariable(
        "TestReference/reverseAboutHourPlus30MTimestamps", "i8", ("Location"), fill_value=MISSING_DATETIME
    )
    reverse_about_hour_plus_30m_timestamps_var[:] = reverse_about_hour_plus_30m_timestamps
    reverse_about_hour_plus_30m_bin_numbers_var = file.createVariable(
        "TestReference/reverseAboutHourPlus30MBinNumbers", "i8", ("Location"), fill_value=MISSING_INT64
    )
    reverse_about_hour_plus_30m_bin_numbers_var[:] = reverse_about_hour_plus_30m_bin_numbers
    about_day_minus_12h_timestamps_var = file.createVariable(
        "TestReference/aboutMidnightMinus12HTimestamps", "i8", ("Location"), fill_value=MISSING_DATETIME
    )
    about_day_minus_12h_timestamps_var[:] = about_day_minus_12h_timestamps
    about_day_minus_12h_bin_numbers_var = file.createVariable(
        "TestReference/aboutMidnightMinus12HBinNumbers", "i8", ("Location"), fill_value=MISSING_INT64
    )
    about_day_minus_12h_bin_numbers_var[:] = about_day_minus_12h_bin_numbers
    about_minute_minus_30s_plus_10s_timestamps_var = file.createVariable(
        "TestReference/aboutMinuteMinus30SPlus10STimestamps", "i8", ("Location"), fill_value=MISSING_DATETIME
    )
    about_minute_minus_30s_plus_10s_timestamps_var[:] = (
        about_minute_minus_30s_plus_10s_timestamps
    )
    about_minute_minus_30s_plus_10s_bin_numbers_var = file.createVariable(
        "TestReference/aboutMinuteMinus30SPlus10SBinNumbers", "i8", ("Location"), fill_value=MISSING_INT64
    )
    about_minute_minus_30s_plus_10s_bin_numbers_var[:] = (
        about_minute_minus_30s_plus_10s_bin_numbers
    )
    file.close()


def bin_datetimes_with_window(
    datetimes,
    bin_centers,
    window_start_offset,
    window_end_offset,
    missing_value=MISSING_DATETIME,
    missing_bin_number=MISSING_INT32,
):
    """
    Bin a NumPy array of datetimes (seconds since epoch) into time windows based on multiple bin centers.
    The entire window (defined by start and end offsets) is treated as a single bin.

    Parameters:
        datetimes (np.ndarray): Array of datetimes (seconds since epoch).
        bin_centers (np.ndarray): Array of bin centers (seconds since epoch).
        window_start_offset (int): Inclusive offset from each bin center to the start of the window (in seconds).
        window_end_offset (int): Inclusive offset from each bin center to the end of the window (in seconds).
        missing_value (int): Value to assign to datetimes outside all windows.
        missing_bin_number (int): Bin number to assign to datetimes outside all windows.

    Returns:
        tuple: (binned_datetimes, bin_numbers)
            - binned_datetimes (np.ndarray): Array of binned datetimes (set to the bin center for values in the window).
            - bin_numbers (np.ndarray): Array of bin numbers, normalized so the lowest non-negative bin number is 0.
    """
    # Initialize arrays for binned datetimes and bin numbers
    binned_datetimes = np.full_like(datetimes, missing_value, dtype=np.int64)
    bin_numbers = np.full_like(datetimes, missing_bin_number, dtype=np.int64)

    # Iterate over each bin center
    for bin_idx, bin_center in enumerate(bin_centers):
        # Calculate the start and end of the binning window for this center
        window_start = bin_center + window_start_offset
        window_end = bin_center + window_end_offset

        # Mask for values within the window
        within_window = (datetimes >= window_start) & (datetimes <= window_end)

        # Assign the bin center to datetimes within the window
        binned_datetimes[within_window] = bin_center

        # Assign the bin number to datetimes within the window
        bin_numbers[within_window] = bin_idx

    # Normalize bin numbers so the lowest non-negative bin number is 0 - the
    # ObsFunction will give the first timestamp it finds which is in a bin the
    # number 0, so we need to ensure that the lowest bin number is 0
    non_negative_bins = bin_numbers[bin_numbers >= 0]
    if len(non_negative_bins) > 0:
        min_bin_number = np.min(non_negative_bins)
        bin_numbers[bin_numbers >= 0] -= min_bin_number

    return binned_datetimes, bin_numbers


if __name__ == "__main__":
    # Generate files for testing the TimeBinner UFO ObsFunction

    # datetimes are in seconds since 1970-01-01 00:00:00 UTC

    # Create observation datetimes spaced by `obs_spacing` seconds starting at
    # 2018-04-14 01:00:00 UTC up to 2018-04-17 01:20:00 UTC (3 days) for `num_locations` locations
    num_locations = 3
    num_days = 3
    obs_per_hour = 15  # Every 4 mins - adjust as necessary
    obs_spacing = 3600 // obs_per_hour  # Time spacing between observations in seconds
    num_obs_per_location = num_days * 24 * obs_per_hour
    num_obs = num_locations * num_obs_per_location
    start_time = int(dt(2018, 4, 14, 1, 0, 0, tzinfo=tz.utc).timestamp())
    datetime = np.asarray(
        [start_time + i * obs_spacing for i in range(num_obs_per_location)]
        * num_locations
    )
    station_id = np.asarray(
        [
            f"Station{i}"
            for i in range(num_locations)
            for _ in range(num_obs_per_location)
        ]
    )

    # Create latitudes and longitudes for `num_locations`
    assert num_locations < 180  # to avoid going outside valid lat/lon ranges
    lat = np.asarray(
        [0.0 + i for i in range(num_locations) for _ in range(num_obs_per_location)]
    )
    lon = np.asarray(
        [0.0 + i for i in range(num_locations) for _ in range(num_obs_per_location)]
    )

    # Create bin centers every 1 hour from 2018-04-14 01:00:00 UTC to
    # 2018-04-17 01:00:00 UTC
    first_bin_center = start_time  # 2018-04-14 01:00:00 UTC
    hourly_bin_centers = np.asarray(
        [
            first_bin_center + 3600 * i
            for i in range(num_days * 24 + 1)  # + 1 to avoid 0 and include last hour
        ]
    )
    about_hour_minus_30m_timestamps, about_hour_minus_30m_bin_numbers = (
        bin_datetimes_with_window(
            datetimes=datetime,
            bin_centers=hourly_bin_centers,
            window_start_offset=-1800,  # 30 minutes before bin center
            window_end_offset=0,  # up to bin center
        )
    )
    (
        excluding_hour_plus_1hour_timestamps,
        excluding_hour_plus_1hour_bin_numbers,
    ) = bin_datetimes_with_window(
        datetimes=datetime,
        bin_centers=hourly_bin_centers,
        window_start_offset=1,  # 1 second after the bin center
        window_end_offset=3600,  # 1 hour after the bin center
    )
    (
        about_hour_minus_30m_plus_29m59s_timestamps,
        about_hour_minus_30m_plus_29m59s_bin_numbers,
    ) = bin_datetimes_with_window(
        datetimes=datetime,
        bin_centers=hourly_bin_centers,
        window_start_offset=-1800,  # 30 minutes before bin center
        window_end_offset=1799,  # up to 29m59s after bin center
    )
    # Put bin centers in reverse chronological order to test reverse ordering
    # of bin numbers
    reverse_chronological_bin_centers = np.flip(hourly_bin_centers)
    (
        reverse_about_hour_minus_30m_plus_29m59s_timestamps,
        reverse_about_hour_minus_30m_plus_29m59s_bin_numbers,
    ) = bin_datetimes_with_window(
        datetimes=datetime,
        bin_centers=reverse_chronological_bin_centers,
        window_start_offset=-1800,  # 30 minutes before bin center
        window_end_offset=1799,  # up to 29m59s after bin center
    )
    assert np.array_equal(
        about_hour_minus_30m_plus_29m59s_timestamps,
        reverse_about_hour_minus_30m_plus_29m59s_timestamps,
    )
    # Try reverse ordering with offset windows too
    (
        reverse_about_hour_minus_30m_timestamps,
        reverse_about_hour_minus_30m_bin_numbers,
    ) = bin_datetimes_with_window(
        datetimes=datetime,
        bin_centers=reverse_chronological_bin_centers,
        window_start_offset=-1800,  # 10 minutes before bin center
        window_end_offset=0,  # at bin center
    )
    assert np.array_equal(
        about_hour_minus_30m_timestamps,
        reverse_about_hour_minus_30m_timestamps,
    )
    (
        reverse_about_hour_plus_30m_timestamps,
        reverse_about_hour_plus_30m_bin_numbers,
    ) = bin_datetimes_with_window(
        datetimes=datetime,
        bin_centers=reverse_chronological_bin_centers,
        window_start_offset=0,  # at bin center
        window_end_offset=1800,  # 30 minutes after bin center
    )
    # Try bin centres about day too where "day" is midnight UTC (derived from
    # first identified datetime - i.e. 2018-04-14 01:00:00 becomes 2018-04-14
    # 00:00:00 UTC)
    first_bin_center = int(dt(2018, 4, 14, 0, 0, 0, tzinfo=tz.utc).timestamp())
    daily_bin_centers = np.asarray(
        [
            first_bin_center + 86400 * i
            for i in range(num_days + 1)  # + 1 to include last day
        ]
    )
    about_day_minus_12h_timestamps, about_day_minus_12h_bin_numbers = (
        bin_datetimes_with_window(
            datetimes=datetime,
            bin_centers=daily_bin_centers,
            window_start_offset=-43200,  # 12 hours before bin center
            window_end_offset=0,  # up to bin center
        )
    )
    # And about minute too (derived first bin center is 2018-04-14 01:00:00 UTC)
    first_bin_center = int(dt(2018, 4, 14, 1, 0, 0, tzinfo=tz.utc).timestamp())
    minute_bin_centers = np.asarray(
        [
            first_bin_center + 60 * i
            for i in range(
                (num_days * 24 * 60) + 1
            )  # + 1 to avoid 0 and include last minute
        ]
    )
    (
        about_minute_minus_30s_plus_10s_timestamps,
        about_minute_minus_30s_plus_10s_bin_numbers,
    ) = bin_datetimes_with_window(
        datetimes=datetime,
        bin_centers=minute_bin_centers,
        window_start_offset=-30,  # 30 seconds before bin center
        window_end_offset=10,  # up to 10 seconds after bin center
    )
    # permute the obs types to make sure that the code does not rely on the
    # ordering of the observation type numbers
    permutation = np.random.permutation(num_obs)
    # permutation = np.arange(num_obs)  # No permutation for easier debugging
    time_binner_test_files_gen(
        name="time_binner_test_obs.nc4",
        lat=lat[permutation],
        lon=lon[permutation],
        datetime=datetime[permutation],
        station_id=station_id[permutation],
        about_hour_minus_30m_timestamps=about_hour_minus_30m_timestamps[permutation],
        about_hour_minus_30m_bin_numbers=about_hour_minus_30m_bin_numbers[permutation],
        excluding_hour_plus_1hour_timestamps=excluding_hour_plus_1hour_timestamps[
            permutation
        ],
        excluding_hour_plus_1hour_bin_numbers=excluding_hour_plus_1hour_bin_numbers[
            permutation
        ],
        about_hour_minus_30m_plus_29m59s_timestamps=about_hour_minus_30m_plus_29m59s_timestamps[
            permutation
        ],
        about_hour_minus_30m_plus_29m59s_bin_numbers=about_hour_minus_30m_plus_29m59s_bin_numbers[
            permutation
        ],
        reverse_about_hour_minus_30m_plus_29m59s_bin_numbers=reverse_about_hour_minus_30m_plus_29m59s_bin_numbers[
            permutation
        ],
        reverse_about_hour_minus_30m_bin_numbers=reverse_about_hour_minus_30m_bin_numbers[
            permutation],
        reverse_about_hour_plus_30m_timestamps=reverse_about_hour_plus_30m_timestamps[
            permutation],
        reverse_about_hour_plus_30m_bin_numbers=reverse_about_hour_plus_30m_bin_numbers[
            permutation],
        about_day_minus_12h_timestamps=about_day_minus_12h_timestamps[permutation],
        about_day_minus_12h_bin_numbers=about_day_minus_12h_bin_numbers[permutation],
        about_minute_minus_30s_plus_10s_timestamps=about_minute_minus_30s_plus_10s_timestamps[
            permutation
        ],
        about_minute_minus_30s_plus_10s_bin_numbers=about_minute_minus_30s_plus_10s_bin_numbers[
            permutation
        ],
    )

Issue(s) addressed

N/A

Dependencies

List the other PRs that this PR is dependent on:

build-group=https://github.com/JCSDA-internal/ufo/pull/4048

@ReubenHill ReubenHill requested review from ctgh and mikecooke77 March 27, 2026 16:36
@ReubenHill ReubenHill changed the title Feature/timebinner Add test file for TimeBinner obsfunction Apr 7, 2026
@mikecooke77
Copy link
Copy Markdown
Collaborator

Could you modify your script so that the missing_float value is assigned as the missing value in the netcdf e.g.

lat_var = file.createVariable("MetaData/latitude", "f4", ("Location"), fill_value=MISSING_FLOAT)

@ReubenHill
Copy link
Copy Markdown
Contributor Author

Could you modify your script so that the missing_float value is assigned as the missing value in the netcdf e.g.

lat_var = file.createVariable("MetaData/latitude", "f4", ("Location"), fill_value=MISSING_FLOAT)

Done - I've updated the description with the python code too

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants