Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [ '3.8' ]
python-version: [ '3.11' ]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
Expand All @@ -22,7 +22,6 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install "numpy<1.19.0"
pip install -r test_requirements.txt
pip install pytest-cov
- name: Test with pytest
Expand Down
30 changes: 11 additions & 19 deletions aodntools/ncwriter/schema.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,20 @@
"""This module holds schema definitions for validating the various :py:class:`dicts` that make up parts of a
template, and also the helper functions necessary to validate an object against their respective schema.
"""

import json

import numpy as np
from jsonschema import validators, Draft4Validator, FormatChecker, ValidationError
from pkg_resources import resource_filename

def is_array(checker, instance):
    """jsonschema TypeChecker predicate for the JSON-schema "array" type.

    Accepts plain Python lists and also numpy arrays, so templates may
    supply variable data as numpy arrays.

    :param checker: the jsonschema TypeChecker invoking this predicate (unused)
    :param instance: the value whose type is being checked
    :return: True if *instance* counts as an "array", False otherwise
    """
    return isinstance(instance, list) or isinstance(instance, np.ndarray)

# Create a new validator class (based on Draft4Validator) to allow templates to use
# * Python types or numpy dtypes to specify variable data types; and
# * numpy arrays to specify variable data.
TemplateValidator = validators.create(meta_schema=Draft4Validator.META_SCHEMA,
validators=Draft4Validator.VALIDATORS)
format_checker = FormatChecker()
# Extend the default type checker by redefining "array"
custom_type_checker = Draft4Validator.TYPE_CHECKER.redefine("array", is_array)

# Create a custom validator that uses the new type checker.
CustomValidator = validators.extend(Draft4Validator, type_checker=custom_type_checker)
format_checker = FormatChecker()

@format_checker.checks('datatype')
def is_python_datatype(value):
Expand All @@ -24,32 +23,25 @@ def is_python_datatype(value):
return True
if isinstance(value, type):
return issubclass(value, np.number)

return False


TYPES = {'array': (list, np.ndarray)}

TEMPLATE_SCHEMA_JSON = resource_filename(__name__, 'template_schema.json')
with open(TEMPLATE_SCHEMA_JSON) as f:
TEMPLATE_SCHEMA = json.load(f)
TemplateValidator.check_schema(TEMPLATE_SCHEMA)

template_validator = TemplateValidator(TEMPLATE_SCHEMA, types=TYPES, format_checker=format_checker)
CustomValidator.check_schema(TEMPLATE_SCHEMA)

# Use the custom validator
template_validator = CustomValidator(TEMPLATE_SCHEMA, format_checker=format_checker)

def validate_template(t):
    """Validate a full template dict against TEMPLATE_SCHEMA.

    :param t: template dict to validate
    :raises jsonschema.ValidationError: if *t* does not conform to the schema
    """
    template_validator.validate(t)


def validate_dimensions(d):
    """Validate a dimensions dict by wrapping it as a minimal template.

    :param d: dict of dimension definitions (the '_dimensions' section of a template)
    :raises jsonschema.ValidationError: if *d* does not conform to the schema
    """
    validate_template({'_dimensions': d})


def validate_variables(v):
    """Validate a variables dict by wrapping it as a minimal template.

    :param v: dict of variable definitions (the '_variables' section of a template)
    :raises jsonschema.ValidationError: if *v* does not conform to the schema
    """
    validate_template({'_variables': v})



def validate_global_attributes(a):
if hasattr(a, 'keys'):
special = [k for k in a.keys() if k.startswith('_')]
Expand Down
10 changes: 7 additions & 3 deletions aodntools/timeseries_products/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from datetime import datetime, timezone

import numpy as np
import xarray as xr

# Common date/time format strings
TIMESTAMP_FORMAT = '%Y-%m-%dT%H:%M:%SZ'
Expand Down Expand Up @@ -179,7 +180,7 @@ def in_water_index(nc):
"""
time_deployment_start = np.datetime64(nc.attrs['time_deployment_start'][:-1])
time_deployment_end = np.datetime64(nc.attrs['time_deployment_end'][:-1])
TIME = nc['TIME'][:]
TIME = nc['TIME'].values
return (TIME >= time_deployment_start) & (TIME <= time_deployment_end)

def in_water(nc):
Expand All @@ -189,8 +190,11 @@ def in_water(nc):
:param nc: xarray dataset
:return: xarray dataset
"""
return nc.where(in_water_index(nc), drop=True)

condition = in_water_index(nc) # NumPy boolean array
# Get the integer indices where condition is True.
indices = np.nonzero(condition)[0]
# Use positional indexing to select the TIME entries that satisfy the condition.
return nc.isel(TIME=indices)

def current_utc_timestamp(format=TIMESTAMP_FORMAT):
    """Return the current UTC time as a formatted string.

    :param format: strftime format string (defaults to the module-level
        TIMESTAMP_FORMAT)
    :return: current timezone-aware UTC time rendered with *format*
    """
    utc_now = datetime.now(timezone.utc)
    return utc_now.strftime(format)
43 changes: 23 additions & 20 deletions aodntools/timeseries_products/hourly_timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,27 +30,27 @@ def check_files(file_list, site_code, parameter_names_accepted, input_dir=''):
:param input_dir: base path where source files are stored
:return: dictionary with the file name and list of failed tests, list good files chronologically ordered
"""

file_list_dataframe = pd.DataFrame(columns=["url", "deployment_date"])
rows = []
error_dict = {}

for file in file_list:
with xr.open_dataset(os.path.join(input_dir, file)) as nc:
error_list = check_file(nc, site_code, parameter_names_accepted)
if error_list:
error_dict.update({file: error_list})
error_dict[file] = error_list
else:
file_list_dataframe = file_list_dataframe.append({'url': file,
'deployment_date': parse(nc.time_deployment_start)},
ignore_index=True)
rows.append({
'url': file,
'deployment_date': parse(nc.time_deployment_start)
})

file_list_dataframe = pd.DataFrame(rows, columns=["url", "deployment_date"])
file_list_dataframe = file_list_dataframe.sort_values(by='deployment_date')
file_list = file_list_dataframe['url'].to_list()
if file_list == []:
sorted_files = file_list_dataframe['url'].to_list()
if not sorted_files:
raise NoInputFilesError("no valid input files to aggregate")

return file_list, error_dict

return sorted_files, error_dict


def get_parameter_names(nc):
Expand Down Expand Up @@ -308,7 +308,7 @@ def PDresample_by_hour(df, function_dict, function_stats):
df_data = pd.DataFrame(index=pd.DatetimeIndex([]))
for variable in varnames:
ds_var = df[variable]
ds_var_resample = ds_var.resample('1H', base=0.5) # shift by half hour to centre bin on the hour
ds_var_resample = ds_var.resample('1h', offset='30min') # shift by half hour to centre bin on the hour
ds_var_mean = ds_var_resample.apply(function_dict[variable]).astype(np.float32)
df_data = pd.concat([df_data, ds_var_mean], axis=1, sort=False)
for stat_method in function_stats:
Expand Down Expand Up @@ -366,8 +366,6 @@ def hourly_aggregator(files_to_aggregate, site_code, qcflags, input_dir='', outp
variable_attribute_dictionary = json.load(json_file)['_variables']

df_data = pd.DataFrame()


## create empty DF with dtypes
metadata_df_types = [('source_file', str),
('instrument_id', str),
Expand All @@ -380,6 +378,7 @@ def hourly_aggregator(files_to_aggregate, site_code, qcflags, input_dir='', outp
parameter_names_all = []
applied_offset = []
qc_count_all = {}
metadata_rows = []

for file_index, file in enumerate(files_to_aggregate):
print(file_index)
Expand All @@ -398,13 +397,16 @@ def hourly_aggregator(files_to_aggregate, site_code, qcflags, input_dir='', outp
qc_count = get_QCcount(nc_clean, qcflags)
qc_count_all = update_QCcount(qc_count_all, qc_count)
nc_clean = good_data_only(nc_clean, qcflags) # good quality data only
df_metadata = df_metadata.append({'source_file': file,
'instrument_id': utils.get_instrument_id(nc),
'LONGITUDE': nc.LONGITUDE.squeeze().values,
'LATITUDE': nc.LATITUDE.squeeze().values,
'NOMINAL_DEPTH': get_nominal_depth(nc)},
ignore_index=True)


# Append a new row as a dictionary to the list.
metadata_rows.append({
'source_file': file,
'instrument_id': utils.get_instrument_id(nc),
'LONGITUDE': nc.LONGITUDE.squeeze().values,
'LATITUDE': nc.LATITUDE.squeeze().values,
'NOMINAL_DEPTH': get_nominal_depth(nc)
})

# If TIME had out-of-range values before cleaning, nc_clean would now have a CFTimeIndex, which
# breaks the resampling further down. Here we reset it to a DatetimeIndex as suggested here:
# https://stackoverflow.com/questions/55786995/converting-cftime-datetimejulian-to-datetime/55787899#55787899
Expand All @@ -421,6 +423,7 @@ def hourly_aggregator(files_to_aggregate, site_code, qcflags, input_dir='', outp
df_temp['instrument_index'] = np.repeat(file_index, len(df_temp)).astype(np.int32)
df_data = pd.concat([df_data, df_temp.reset_index()], ignore_index=True, sort=False)

df_metadata = pd.DataFrame(metadata_rows, columns=['source_file', 'instrument_id', 'LONGITUDE', 'LATITUDE', 'NOMINAL_DEPTH'])
df_metadata.index.rename('INSTRUMENT', inplace=True)
df_data.index.rename('OBSERVATION', inplace=True)
## rename index to TIME
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def append_resampled_values(nc_cell, ds, slice_start, binning_functions):
# shift the index forward 30min to centre the bins on the hour
df_cell.index = df_cell.index + pd.Timedelta(minutes=30)

df_cell_1H = df_cell.resample('1H')
df_cell_1H = df_cell.resample('1h')
slice_end = len(df_cell_1H) + slice_start

# set binned timestamps
Expand Down
4 changes: 0 additions & 4 deletions constraints.txt
Original file line number Diff line number Diff line change
@@ -1,4 +0,0 @@
cftime<1.1.1;python_version=='3.5'
netCDF4<1.5.4;python_version=='3.5'
pandas<0.25.0;python_version=='3.5'
xarray<0.14.0;python_version=='3.5'
3 changes: 2 additions & 1 deletion examples/rottnest.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@
var_type = var['_datatype']
for attr in ('valid_min', 'valid_max'):
if attr in var:
var[attr] = np.cast[var_type](var[attr])
var[attr] = np.array(var[attr], dtype=var_type)


# update range attributes
template.add_extent_attributes()
Expand Down
15 changes: 7 additions & 8 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
from setuptools import setup, find_packages

INSTALL_REQUIRES = [
'jsonschema>=2.6.0,<3.0.0',
'numpy>=1.13.0',
'netCDF4>=1.5.3',
'pandas>=0.24.2',
'xarray>=0.11.3'
'jsonschema>=4.23.0',
'numpy>=2.2.4',
'netCDF4>=1.7.2',
'pandas>=2.2.3',
'xarray>=2023.1.0'
]

TESTS_REQUIRE = [
Expand Down Expand Up @@ -37,7 +37,7 @@
author_email='projectofficers@emii.org.au',
description='AODN data tools library',
zip_safe=False,
python_requires='>=3.5',
python_requires='>=3.11, <3.12',
install_requires=INSTALL_REQUIRES,
tests_require=TESTS_REQUIRE,
extras_require=EXTRAS_REQUIRE,
Expand All @@ -49,8 +49,7 @@
'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
'Programming Language :: Python',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.11',
'Programming Language :: Python :: Implementation :: CPython',
]
)