Allow validation sets without model outputs for LLMs (script and api)

gustavocidornelas · whoseoyster · commit cf4dd921bb73 · 2023-08-23T08:54:22.000-07:00
diff --git a/openlayer/__init__.py b/openlayer/__init__.py
@@ -27,6 +27,7 @@
 import tempfile
 import time
 import uuid
+import warnings
 from typing import Optional
 
 import pandas as pd
@@ -937,15 +938,47 @@ def commit(self, message: str, project_id: str, force: bool = False):
                 print("Keeping the existing commit message.")
                 return
 
+        llm_and_no_outputs = self._check_llm_and_no_outputs(project_dir=project_dir)
+        if llm_and_no_outputs:
+            warnings.warn(
+                "You are committing an LLM without validation outputs computed "
+                "in the validation set. This means that the platform will try to "
+                "compute the validation outputs for you. This may take a while and "
+                "there are costs associated with it."
+            )
         commit = {
             "message": message,
             "date": time.ctime(),
+            "computeOutputs": llm_and_no_outputs,
         }
         with open(f"{project_dir}/commit.yaml", "w", encoding="UTF-8") as commit_file:
             yaml.dump(commit, commit_file)
 
         print("Committed!")
 
+    def _check_llm_and_no_outputs(self, project_dir: str) -> bool:
+        """Checks if the project's staging area contains an LLM and no outputs."""
+        # Check if validation set has outputs
+        validation_has_no_outputs = False
+        if os.path.exists(f"{project_dir}/validation"):
+            validation_dataset_config = utils.load_dataset_config_from_bundle(
+                bundle_path=project_dir, label="validation"
+            )
+            output_column_name = validation_dataset_config.get("outputColumnName")
+            validation_has_no_outputs = output_column_name is None
+
+        # Check if the model is an LLM
+        model_is_llm = False
+        if os.path.exists(f"{project_dir}/model"):
+            model_config = utils.read_yaml(f"{project_dir}/model/model_config.yaml")
+            architecture_type = model_config.get("architectureType")
+            model_type = model_config.get("modelType")
+
+            if architecture_type == "llm" and model_type != "shell":
+                model_is_llm = True
+
+        return validation_has_no_outputs and model_is_llm
+
     def push(self, project_id: str, task_type: TaskType) -> Optional[ProjectVersion]:
         """Pushes the commited resources to the platform.
 
diff --git a/openlayer/schemas.py b/openlayer/schemas.py
@@ -153,7 +153,8 @@ class LLMOutputSchema(BaseDatasetSchema):
     )
     outputColumnName = ma.fields.Str(
         validate=COLUMN_NAME_VALIDATION_LIST,
-        required=True,
+        allow_none=True,
+        load_default=None,
     )
 
 
diff --git a/openlayer/validators/commit_validators.py b/openlayer/validators/commit_validators.py
@@ -6,6 +6,7 @@
 
 import marshmallow as ma
 import pandas as pd
+import yaml
 
 from .. import schemas, tasks, utils
 from . import baseline_model_validators, dataset_validators, model_validators
@@ -127,6 +128,11 @@ def _validate_bundle_state(self):
                 label="validation"
             )
 
+        # Check if flagged to compute the model outputs
+        with open(f"{self.bundle_path}/commit.yaml", "r") as commit_file:
+            commit = yaml.safe_load(commit_file)
+        compute_outputs = commit.get("computeOutputs", False)
+
         if "model" in self._bundle_resources:
             model_type = self.model_config.get("modelType")
 
@@ -163,7 +169,7 @@ def _validate_bundle_state(self):
                     "training" not in self._bundle_resources
                     or "fine-tuning" not in self._bundle_resources
                 ) and ("validation" in self._bundle_resources):
-                    if not outputs_in_validation_set:
+                    if not outputs_in_validation_set and not compute_outputs:
                         self.failed_validations.append(
                             "You are trying to push a model and a validation set to the platform. "
                             "However, the validation set does not contain predictions. "
@@ -186,7 +192,9 @@ def _validate_bundle_state(self):
                     "training" in self._bundle_resources
                     or "fine-tuning" in self._bundle_resources
                 ) and ("validation" in self._bundle_resources):
-                    if not outputs_in_training_set or not outputs_in_validation_set:
+                    if (
+                        not outputs_in_training_set or not outputs_in_validation_set
+                    ) and not compute_outputs:
                         self.failed_validations.append(
                             "You are trying to push a model, a training/fine-tuning set and a validation "
                             "set to the platform. "

Original file line number	Diff line number	Diff line change
`@@ -153,7 +153,8 @@ class LLMOutputSchema(BaseDatasetSchema):`
`153`	`153`	`)`
`154`	`154`	`outputColumnName = ma.fields.Str(`
`155`	`155`	`validate=COLUMN_NAME_VALIDATION_LIST,`
`156`		`- required=True,`
	`156`	`+ allow_none=True,`
	`157`	`+ load_default=None,`
`157`	`158`	`)`
`158`	`159`
`159`	`160`