From 1393daa8c0b0a7ec878e9bd8d70cf2858e5253c8 Mon Sep 17 00:00:00 2001 From: JGSweets Date: Wed, 4 Mar 2026 17:25:18 -0600 Subject: [PATCH 1/6] Update data_utils.py --- dataprofiler/data_readers/data_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataprofiler/data_readers/data_utils.py b/dataprofiler/data_readers/data_utils.py index 611d25dc..80296f16 100644 --- a/dataprofiler/data_readers/data_utils.py +++ b/dataprofiler/data_readers/data_utils.py @@ -29,7 +29,7 @@ import pandas as pd import pyarrow.parquet as pq import requests -from chardet.universaldetector import UniversalDetector +from chardet import UniversalDetector from typing_extensions import TypeGuard from .. import dp_logging, rng_utils From 3972c70b3cb40577e8d599133966d6fa549a42eb Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Fri, 13 Mar 2026 12:21:42 -0500 Subject: [PATCH 2/6] revert: chardet import --- dataprofiler/data_readers/data_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataprofiler/data_readers/data_utils.py b/dataprofiler/data_readers/data_utils.py index 80296f16..611d25dc 100644 --- a/dataprofiler/data_readers/data_utils.py +++ b/dataprofiler/data_readers/data_utils.py @@ -29,7 +29,7 @@ import pandas as pd import pyarrow.parquet as pq import requests -from chardet import UniversalDetector +from chardet.universaldetector import UniversalDetector from typing_extensions import TypeGuard from .. import dp_logging, rng_utils From 16640b411b26cabe9b3e5d50175f9a5be51c1d93 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Fri, 13 Mar 2026 12:22:54 -0500 Subject: [PATCH 3/6] fix: chardet requirements --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e32f3285..ea6d97db 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ pandas>=1.1.2 python-dateutil>=2.7.5 pytz>=2020.1 pyarrow>=1.0.1 -chardet>=3.0.4 +chardet>=3.0.4,<7.0.0 fastavro>=1.1.0 python-snappy>=0.7.1 charset-normalizer>=1.3.6 From 88137dbd184dabb55eecefc9d5d94d1260bb3ba6 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Fri, 13 Mar 2026 12:44:38 -0500 Subject: [PATCH 4/6] fix: pandas req limit --- .pre-commit-config.yaml | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 666cde4b..3f857386 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -51,7 +51,7 @@ repos: h5py>=2.10.0, wheel>=0.33.1, numpy<2.0.0, - pandas>=1.1.2, + 'pandas>=1.1.2,<3.0.0', python-dateutil>=2.7.5, pytz>=2020.1, pyarrow>=1.0.1, diff --git a/requirements.txt b/requirements.txt index ea6d97db..1036c433 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ h5py>=2.10.0 wheel>=0.33.1 numpy<2.0.0 -pandas>=1.1.2 +pandas>=1.1.2,<3.0.0 python-dateutil>=2.7.5 pytz>=2020.1 pyarrow>=1.0.1 From a344a26576ce8ed4f5c239a5e92203a056374be2 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Fri, 13 Mar 2026 13:06:22 -0500 Subject: [PATCH 5/6] fix: tests with approx --- .../test_categorical_column_profile.py | 5 ++++ .../test_column_profile_compilers.py | 29 +++++++++++++++++++ .../profilers/test_float_column_profile.py | 5 ++++ .../profilers/test_int_column_profile.py | 5 ++++ .../tests/profilers/test_profile_builder.py | 8 +++-- .../profilers/test_text_column_profile.py | 5 ++++ 6 files changed, 55 insertions(+), 2 deletions(-) diff --git a/dataprofiler/tests/profilers/test_categorical_column_profile.py b/dataprofiler/tests/profilers/test_categorical_column_profile.py index 55d2ea68..6672b416 100644 --- a/dataprofiler/tests/profilers/test_categorical_column_profile.py +++ b/dataprofiler/tests/profilers/test_categorical_column_profile.py @@ -784,6 +784,11 @@ def test_categorical_diff(self): "categorical_count": {"y": 1, "n": 1, "maybe": -1}, }, } + self.assertAlmostEqual( + expected_diff.get("statistics").get("chi2-test").pop("p-value"), + actual_diff.get("statistics").get("chi2-test").pop("p-value"), + places=10, + ) self.assertDictEqual(expected_diff, profile.diff(profile2)) def test_unalikeability(self): diff --git a/dataprofiler/tests/profilers/test_column_profile_compilers.py b/dataprofiler/tests/profilers/test_column_profile_compilers.py index 1e0afc12..2617ccc6 100644 --- a/dataprofiler/tests/profilers/test_column_profile_compilers.py +++ b/dataprofiler/tests/profilers/test_column_profile_compilers.py @@ -256,6 +256,19 @@ def test_diff_primitive_compilers(self): profile_diff["statistics"].pop("median_absolute_deviation"), places=2, ) + self.assertAlmostEqual( + expected_diff["statistics"].get("t-test").get("welch").pop("p-value"), + profile_diff["statistics"].get("t-test").get("welch").pop("p-value"), + places=10, + ) + self.assertAlmostEqual( + expected_diff["statistics"] + .get("t-test") + .get("conservative") + .pop("p-value"), + profile_diff["statistics"].get("t-test").get("conservative").pop("p-value"), + places=10, + ) self.assertDictEqual(expected_diff, profile_diff) # Test different compilers @@ -354,6 +367,22 @@ def test_disabling_columns_during_primitive_diff(self): profile_diff["statistics"].pop("median_absolute_deviation"), places=2, ) + self.assertAlmostEqual( + expected_diff.get("statistics").get("t-test").get("welch").pop("p-value"), + profile_diff.get("statistics").get("t-test").get("welch").pop("p-value"), + places=10, + ) + self.assertAlmostEqual( + expected_diff.get("statistics") + .get("t-test") + .get("conservative") + .pop("p-value"), + profile_diff.get("statistics") + .get("t-test") + .get("conservative") + .pop("p-value"), + places=10, + ) self.assertDictEqual(expected_diff, profile_diff) # Test disabling all columns in one compiler diff --git a/dataprofiler/tests/profilers/test_float_column_profile.py b/dataprofiler/tests/profilers/test_float_column_profile.py index d79fdd64..e3f93ed3 100644 --- a/dataprofiler/tests/profilers/test_float_column_profile.py +++ b/dataprofiler/tests/profilers/test_float_column_profile.py @@ -1728,6 +1728,11 @@ def test_diff(self): np.testing.assert_almost_equal( sorted(expected_diff_mode[i]), sorted(diff_mode[i]), 2 ) + self.assertAlmostEqual( + expected_diff.get("t-test").get("welch").pop("p-value"), + profile_diff.get("t-test").get("welch").pop("p-value"), + places=10, + ) self.assertAlmostEqual( expected_diff.pop("median_absolute_deviation"), profile_diff.pop("median_absolute_deviation"), diff --git a/dataprofiler/tests/profilers/test_int_column_profile.py b/dataprofiler/tests/profilers/test_int_column_profile.py index 961b33c8..960e5318 100644 --- a/dataprofiler/tests/profilers/test_int_column_profile.py +++ b/dataprofiler/tests/profilers/test_int_column_profile.py @@ -1087,6 +1087,11 @@ def test_diff(self): np.testing.assert_almost_equal( sorted(expected_diff_mode[i]), sorted(diff_mode[i]), 2 ) + self.assertAlmostEqual( + expected_diff.get("t-test").get("welch").pop("p-value"), + profile_diff.get("t-test").get("welch").pop("p-value"), + places=10, + ) self.assertAlmostEqual( expected_diff.pop("median_absolute_deviation"), profile_diff.pop("median_absolute_deviation"), diff --git a/dataprofiler/tests/profilers/test_profile_builder.py b/dataprofiler/tests/profilers/test_profile_builder.py index c4e60473..0c48051b 100644 --- a/dataprofiler/tests/profilers/test_profile_builder.py +++ b/dataprofiler/tests/profilers/test_profile_builder.py @@ -2156,9 +2156,13 @@ def test_diff_categorical_chi2_test(self, *mocks): "deg_of_free": 2, "p-value": 0.3099238764710244, } - self.assertDictEqual( - expected_chi2_test_dict, diff["data_stats"][0]["statistics"]["chi2-test"] + chi2_diff = diff["data_stats"][0]["statistics"]["chi2-test"] + self.assertAlmostEqual( + expected_chi2_test_dict.pop("p-value"), + chi2_diff.pop("p-value"), + places=10, ) + self.assertDictEqual(expected_chi2_test_dict, chi2_diff) @mock.patch( "dataprofiler.profilers.data_labeler_column_profile.DataLabelerColumn.update" diff --git a/dataprofiler/tests/profilers/test_text_column_profile.py b/dataprofiler/tests/profilers/test_text_column_profile.py index 12fb1d27..f45d5c77 100644 --- a/dataprofiler/tests/profilers/test_text_column_profile.py +++ b/dataprofiler/tests/profilers/test_text_column_profile.py @@ -607,6 +607,11 @@ def test_diff(self): np.testing.assert_almost_equal( sorted(expected_diff_mode[i]), sorted(diff_mode[i]), 2 ) + self.assertAlmostEqual( + expected_diff.get("t-test").get("welch").pop("p-value"), + profile_diff.get("t-test").get("welch").pop("p-value"), + places=10, + ) self.assertAlmostEqual( expected_diff.pop("median_absolute_deviation"), profile_diff.pop("median_absolute_deviation"), From ce78b287bf922bf902bfd42257cccb7bad1bc506 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Fri, 13 Mar 2026 13:17:16 -0500 Subject: [PATCH 6/6] fix: missing test fix --- .../tests/profilers/test_categorical_column_profile.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dataprofiler/tests/profilers/test_categorical_column_profile.py b/dataprofiler/tests/profilers/test_categorical_column_profile.py index 6672b416..c5607cf9 100644 --- a/dataprofiler/tests/profilers/test_categorical_column_profile.py +++ b/dataprofiler/tests/profilers/test_categorical_column_profile.py @@ -731,6 +731,11 @@ def test_categorical_diff(self): }, } actual_diff = profile.diff(profile2) + self.assertAlmostEqual( + expected_diff.get("statistics").get("chi2-test").pop("p-value"), + actual_diff.get("statistics").get("chi2-test").pop("p-value"), + places=10, + ) self.assertDictEqual(expected_diff, actual_diff) # Test with one categorical column matching @@ -784,11 +789,6 @@ def test_categorical_diff(self): "categorical_count": {"y": 1, "n": 1, "maybe": -1}, }, } - self.assertAlmostEqual( - expected_diff.get("statistics").get("chi2-test").pop("p-value"), - actual_diff.get("statistics").get("chi2-test").pop("p-value"), - places=10, - ) self.assertDictEqual(expected_diff, profile.diff(profile2)) def test_unalikeability(self):