-
Notifications
You must be signed in to change notification settings - Fork 1.7k
ISSUE #26706 - Migrate Failed sample rows to OS #26780
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 10 commits
d5442fc
3ce0e0e
5bdcdfe
24ea87a
7b56fc0
66deb06
ae74520
d9b630e
88cbf9f
7c84b8c
3534c87
9d77479
07a49cc
620209c
7a183f4
45f9b11
a532971
71f2b68
e9fc955
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -32,6 +32,7 @@ | |
|
|
||
| from pydantic import BaseModel | ||
|
|
||
| from metadata.data_quality.api.models import TestCaseResultResponse | ||
| from metadata.data_quality.validations import utils | ||
| from metadata.data_quality.validations.impact_score import ( | ||
| DEFAULT_TOP_DIMENSIONS, | ||
|
|
@@ -134,7 +135,7 @@ def _get_top_dimensions(self) -> int: | |
| return DEFAULT_TOP_DIMENSIONS | ||
| return min(value, MAX_TOP_DIMENSIONS) | ||
|
|
||
| def run_validation(self) -> TestCaseResult: | ||
| def run_validation(self) -> TestCaseResultResponse: | ||
| """Template method defining the validation flow with optional dimensional analysis | ||
| This method orchestrates the overall validation process: | ||
|
Comment on lines
135
to
141
|
||
|
|
@@ -186,7 +187,20 @@ def run_validation(self) -> TestCaseResult: | |
| ) | ||
| logger.debug(traceback.format_exc()) | ||
|
|
||
| return test_result | ||
| result: TestCaseResultResponse = TestCaseResultResponse( | ||
| testCaseResult=test_result, testCase=self.test_case | ||
| ) | ||
|
|
||
| self.result_with_failed_samples(result) | ||
|
|
||
| return result | ||
|
||
|
|
||
| def result_with_failed_samples(self, result: TestCaseResultResponse) -> None: | ||
| """Hook for failed row sampling. No-op by default. | ||
| Overridden by FailedSampleValidatorMixin to fetch and stash | ||
| failed row samples on the validator instance. | ||
TeddyCr marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| """ | ||
|
|
||
| @abstractmethod | ||
| def _run_validation(self) -> TestCaseResult: | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -14,6 +14,7 @@ | |
| """ | ||
|
|
||
| from collections import defaultdict | ||
| from datetime import datetime | ||
| from typing import List, Optional, cast | ||
|
|
||
| import pandas as pd | ||
|
|
@@ -27,20 +28,32 @@ | |
| BaseColumnValuesToBeBetweenValidator, | ||
| ) | ||
| from metadata.data_quality.validations.impact_score import calculate_impact_score_pandas | ||
| from metadata.data_quality.validations.mixins.failed_row_sampler_mixin import ( | ||
| PandasFailedRowSamplerMixin, | ||
| ) | ||
| from metadata.data_quality.validations.mixins.failed_sample_validator_mixin import ( | ||
| FailedSampleValidatorMixin, | ||
| ) | ||
| from metadata.data_quality.validations.mixins.pandas_validator_mixin import ( | ||
| PandasValidatorMixin, | ||
| aggregate_others_statistical_pandas, | ||
| ) | ||
| from metadata.generated.schema.entity.data.table import TableData | ||
| from metadata.generated.schema.tests.dimensionResult import DimensionResult | ||
| from metadata.profiler.metrics.registry import Metrics | ||
| from metadata.profiler.orm.registry import is_date_time | ||
| from metadata.utils.logger import test_suite_logger | ||
| from metadata.utils.sqa_like_column import SQALikeColumn | ||
| from metadata.utils.time_utils import convert_timestamp | ||
|
|
||
| logger = test_suite_logger() | ||
|
|
||
|
|
||
| class ColumnValuesToBeBetweenValidator( | ||
| BaseColumnValuesToBeBetweenValidator, PandasValidatorMixin | ||
| FailedSampleValidatorMixin, | ||
| BaseColumnValuesToBeBetweenValidator, | ||
| PandasValidatorMixin, | ||
| PandasFailedRowSamplerMixin, | ||
| ): | ||
| """Validator for column values to be between test case""" | ||
|
|
||
|
|
@@ -237,3 +250,34 @@ def compute_row_count(self, column: SQALikeColumn, min_bound: int, max_bound: in | |
| ) | ||
|
|
||
| return row_count, failed_rows | ||
|
|
||
| def filter(self): | ||
| column = self.get_column() | ||
| if is_date_time(column.type): | ||
| min_bound = self.get_test_case_param_value( | ||
| self.test_case.parameterValues, | ||
| "minValue", | ||
| type_=datetime.fromtimestamp, | ||
| default=datetime.min, | ||
| pre_processor=convert_timestamp, | ||
| ) | ||
|
Comment on lines
+256
to
+263
|
||
| max_bound = self.get_test_case_param_value( | ||
| self.test_case.parameterValues, | ||
| "maxValue", | ||
| type_=datetime.fromtimestamp, | ||
| default=datetime.max, | ||
| pre_processor=convert_timestamp, | ||
| ) | ||
| else: | ||
| min_bound = self.get_min_bound("minValue") | ||
| max_bound = self.get_max_bound("maxValue") | ||
| filters = [] | ||
| if min_bound is not None: | ||
| filters.append(f"{column.name} < {min_bound}") | ||
| if max_bound is not None: | ||
| filters.append(f"{column.name} > {max_bound}") | ||
| return " or ".join(filters) | ||
TeddyCr marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| def fetch_failed_rows_sample(self): | ||
| cols, rows = self._get_failed_rows_sample() | ||
| return TableData(columns=cols, rows=rows) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`run_validation()` is now typed to return `TestCaseResultResponse`, but there is still an early return in this method that returns `test_result` (a `TestCaseResult`) when dimension columns are invalid. That creates an inconsistent return type and will break callers expecting the response wrapper; wrap the early-return path in `TestCaseResultResponse` as well.