Skip to content

Commit 679dbb4

Browse files
Closes OPEN-4056 (zero-indexed labels validation doesn't allow datasets with only a sample of the labels to be uploaded) and closes OPEN-4044 (add a data type validation prior to the other prediction checks)
1 parent 14e5bf4 commit 679dbb4

File tree

2 files changed

+22
-23
lines changed

2 files changed

+22
-23
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
2020

2121
### Changed
2222

23+
* Modified the zero-index integer checks for `predictionsColumnName` and `labelColumnName` to support dataset uploads with only a sample of the classes.
2324
* Renamed the `predictionsColumnName` argument in the datasets' configuration YAML to `predictionScoresColumnName`.
2425
* Migrated package name from [unboxapi](https://pypi.org/project/unboxapi/) to [openlayer](https://pypi.org/project/openlayer/) due to a company name change.
2526
* Required Python version `>=3.7` and `<3.9`.

openlayer/validators/dataset_validators.py

Lines changed: 21 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -264,34 +264,35 @@ def _validate_labels(self):
264264
)
265265
else:
266266
if self.class_names:
267-
self._validate_all_categories_in_class_names(
268-
column_name=self.label_column_name
269-
)
270267
self._validate_categories_zero_indexed(
271268
column_name=self.label_column_name
272269
)
273270

274-
def _validate_all_categories_in_class_names(self, column_name: str):
275-
"""Checks whether there are categories in the dataset's `column_name` which are not
276-
in the `class_names`."""
277-
num_classes = len(self.dataset_df[column_name].unique())
278-
if num_classes > len(self.class_names):
279-
self.failed_validations.append(
280-
"There are more classes in the dataset's column"
281-
f" `{column_name}` than specified in `classNames`. "
282-
"Please specify all possible labels in the `classNames` list."
283-
)
284-
285271
def _validate_categories_zero_indexed(self, column_name: str):
286272
"""Checks whether the categories are zero-indexed in the dataset's `column_name`."""
287-
unique_labels = set(self.dataset_df[column_name].unique())
288-
zero_indexed_set = set(range(len(self.class_names)))
289-
if unique_labels != zero_indexed_set:
273+
if (
274+
self.dataset_df[column_name].dtype.name != "int64"
275+
and self.dataset_df[column_name].dtype.name != "int32"
276+
):
290277
self.failed_validations.append(
291-
"The classes in the dataset are not zero-indexed. "
292-
f"Make sure that the classes in the column `{column_name}` "
293-
"are zero-indexed integers that match the list in `classNames`."
278+
f"The classes in the dataset column `{column_name}` must be integers. "
279+
f"Make sure that the column `{column_name}` is of dtype `int32` or `int64`."
294280
)
281+
else:
282+
max_class = self.dataset_df[column_name].max()
283+
284+
if max_class > len(self.class_names) - 1:
285+
self.failed_validations.append(
286+
"The classes in the dataset are not zero-indexed. "
287+
f"The column `{column_name}` contains classes up to {max_class}, "
288+
f"but the list of classes provided in `classNames` contains only "
289+
f"{len(self.class_names)} elements. "
290+
f"Make sure that the classes in the column `{column_name}` "
291+
"are zero-indexed integers that match the list in `classNames`. "
292+
"Note that the index of the first class should be 0, not 1, so "
293+
f"if the maximum class is {max_class}, the `classNames` list "
294+
f"should contain {max_class + 1} elements."
295+
)
295296

296297
def _validate_predictions(self):
297298
"""Validates the data in the predictions column."""
@@ -302,9 +303,6 @@ def _validate_predictions(self):
302303
)
303304
else:
304305
if self.class_names:
305-
self._validate_all_categories_in_class_names(
306-
column_name=self.predictions_column_name
307-
)
308306
self._validate_categories_zero_indexed(
309307
column_name=self.predictions_column_name
310308
)

0 commit comments

Comments
 (0)