diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 666cde4b..3f857386 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -51,7 +51,7 @@ repos: h5py>=2.10.0, wheel>=0.33.1, numpy<2.0.0, - pandas>=1.1.2, + 'pandas>=1.1.2,<3.0.0', python-dateutil>=2.7.5, pytz>=2020.1, pyarrow>=1.0.1, diff --git a/dataprofiler/tests/profilers/test_categorical_column_profile.py b/dataprofiler/tests/profilers/test_categorical_column_profile.py index 55d2ea68..c5607cf9 100644 --- a/dataprofiler/tests/profilers/test_categorical_column_profile.py +++ b/dataprofiler/tests/profilers/test_categorical_column_profile.py @@ -731,6 +731,11 @@ def test_categorical_diff(self): }, } actual_diff = profile.diff(profile2) + self.assertAlmostEqual( + expected_diff.get("statistics").get("chi2-test").pop("p-value"), + actual_diff.get("statistics").get("chi2-test").pop("p-value"), + places=10, + ) self.assertDictEqual(expected_diff, actual_diff) # Test with one categorical column matching diff --git a/dataprofiler/tests/profilers/test_column_profile_compilers.py b/dataprofiler/tests/profilers/test_column_profile_compilers.py index 1e0afc12..2617ccc6 100644 --- a/dataprofiler/tests/profilers/test_column_profile_compilers.py +++ b/dataprofiler/tests/profilers/test_column_profile_compilers.py @@ -256,6 +256,19 @@ def test_diff_primitive_compilers(self): profile_diff["statistics"].pop("median_absolute_deviation"), places=2, ) + self.assertAlmostEqual( + expected_diff["statistics"].get("t-test").get("welch").pop("p-value"), + profile_diff["statistics"].get("t-test").get("welch").pop("p-value"), + places=10, + ) + self.assertAlmostEqual( + expected_diff["statistics"] + .get("t-test") + .get("conservative") + .pop("p-value"), + profile_diff["statistics"].get("t-test").get("conservative").pop("p-value"), + places=10, + ) self.assertDictEqual(expected_diff, profile_diff) # Test different compilers @@ -354,6 +367,22 @@ def test_disabling_columns_during_primitive_diff(self): profile_diff["statistics"].pop("median_absolute_deviation"), places=2, ) + self.assertAlmostEqual( + expected_diff.get("statistics").get("t-test").get("welch").pop("p-value"), + profile_diff.get("statistics").get("t-test").get("welch").pop("p-value"), + places=10, + ) + self.assertAlmostEqual( + expected_diff.get("statistics") + .get("t-test") + .get("conservative") + .pop("p-value"), + profile_diff.get("statistics") + .get("t-test") + .get("conservative") + .pop("p-value"), + places=10, + ) self.assertDictEqual(expected_diff, profile_diff) # Test disabling all columns in one compiler diff --git a/dataprofiler/tests/profilers/test_float_column_profile.py b/dataprofiler/tests/profilers/test_float_column_profile.py index d79fdd64..e3f93ed3 100644 --- a/dataprofiler/tests/profilers/test_float_column_profile.py +++ b/dataprofiler/tests/profilers/test_float_column_profile.py @@ -1728,6 +1728,11 @@ def test_diff(self): np.testing.assert_almost_equal( sorted(expected_diff_mode[i]), sorted(diff_mode[i]), 2 ) + self.assertAlmostEqual( + expected_diff.get("t-test").get("welch").pop("p-value"), + profile_diff.get("t-test").get("welch").pop("p-value"), + places=10, + ) self.assertAlmostEqual( expected_diff.pop("median_absolute_deviation"), profile_diff.pop("median_absolute_deviation"), diff --git a/dataprofiler/tests/profilers/test_int_column_profile.py b/dataprofiler/tests/profilers/test_int_column_profile.py index 961b33c8..960e5318 100644 --- a/dataprofiler/tests/profilers/test_int_column_profile.py +++ b/dataprofiler/tests/profilers/test_int_column_profile.py @@ -1087,6 +1087,11 @@ def test_diff(self): np.testing.assert_almost_equal( sorted(expected_diff_mode[i]), sorted(diff_mode[i]), 2 ) + self.assertAlmostEqual( + expected_diff.get("t-test").get("welch").pop("p-value"), + profile_diff.get("t-test").get("welch").pop("p-value"), + places=10, + ) self.assertAlmostEqual( expected_diff.pop("median_absolute_deviation"), profile_diff.pop("median_absolute_deviation"), diff --git a/dataprofiler/tests/profilers/test_profile_builder.py b/dataprofiler/tests/profilers/test_profile_builder.py index c4e60473..0c48051b 100644 --- a/dataprofiler/tests/profilers/test_profile_builder.py +++ b/dataprofiler/tests/profilers/test_profile_builder.py @@ -2156,9 +2156,13 @@ def test_diff_categorical_chi2_test(self, *mocks): "deg_of_free": 2, "p-value": 0.3099238764710244, } - self.assertDictEqual( - expected_chi2_test_dict, diff["data_stats"][0]["statistics"]["chi2-test"] + chi2_diff = diff["data_stats"][0]["statistics"]["chi2-test"] + self.assertAlmostEqual( + expected_chi2_test_dict.pop("p-value"), + chi2_diff.pop("p-value"), + places=10, ) + self.assertDictEqual(expected_chi2_test_dict, chi2_diff) @mock.patch( "dataprofiler.profilers.data_labeler_column_profile.DataLabelerColumn.update" diff --git a/dataprofiler/tests/profilers/test_text_column_profile.py b/dataprofiler/tests/profilers/test_text_column_profile.py index 12fb1d27..f45d5c77 100644 --- a/dataprofiler/tests/profilers/test_text_column_profile.py +++ b/dataprofiler/tests/profilers/test_text_column_profile.py @@ -607,6 +607,11 @@ def test_diff(self): np.testing.assert_almost_equal( sorted(expected_diff_mode[i]), sorted(diff_mode[i]), 2 ) + self.assertAlmostEqual( + expected_diff.get("t-test").get("welch").pop("p-value"), + profile_diff.get("t-test").get("welch").pop("p-value"), + places=10, + ) self.assertAlmostEqual( expected_diff.pop("median_absolute_deviation"), profile_diff.pop("median_absolute_deviation"), diff --git a/requirements.txt b/requirements.txt index e32f3285..1036c433 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,11 @@ h5py>=2.10.0 wheel>=0.33.1 numpy<2.0.0 -pandas>=1.1.2 +pandas>=1.1.2,<3.0.0 python-dateutil>=2.7.5 pytz>=2020.1 pyarrow>=1.0.1 -chardet>=3.0.4 +chardet>=3.0.4,<7.0.0 fastavro>=1.1.0 python-snappy>=0.7.1 charset-normalizer>=1.3.6