improvement: allow specifying dataset as path for uploads

whoseoyster · whoseoyster · commit 9167c1de86f0 · 2024-07-31T10:10:14.000-07:00
diff --git a/src/openlayer/lib/data/batch_inferences.py b/src/openlayer/lib/data/batch_inferences.py
@@ -19,12 +19,18 @@
 def upload_batch_inferences(
     client: Openlayer,
     inference_pipeline_id: str,
-    dataset_df: pd.DataFrame,
     config: data_stream_params.Config,
+    dataset_df: Optional[pd.DataFrame] = None,
+    dataset_path: Optional[str] = None,
     storage_type: Optional[StorageType] = None,
     merge: bool = False,
 ) -> None:
     """Uploads a batch of inferences to the Openlayer platform."""
+    if dataset_df is None and dataset_path is None:
+        raise ValueError("Either dataset_df or dataset_path must be provided.")
+    if dataset_df is not None and dataset_path is not None:
+        raise ValueError("Only one of dataset_df or dataset_path should be provided.")
+
     uploader = _upload.Uploader(client, storage_type)
     object_name = f"batch_data_{time.time()}_{inference_pipeline_id}.tar.gz"
 
@@ -35,8 +41,11 @@ def upload_batch_inferences(
 
     # Write dataset and config to temp directory
     with tempfile.TemporaryDirectory() as tmp_dir:
-        temp_file_path = f"{tmp_dir}/dataset.csv"
-        dataset_df.to_csv(temp_file_path, index=False)
+        if dataset_df is not None:
+            temp_file_path = f"{tmp_dir}/dataset.csv"
+            dataset_df.to_csv(temp_file_path, index=False)
+        else:
+            temp_file_path = dataset_path
 
         # Copy relevant files to tmp dir
         config["label"] = "production"
@@ -47,7 +56,11 @@ def upload_batch_inferences(
 
         tar_file_path = os.path.join(tmp_dir, object_name)
         with tarfile.open(tar_file_path, mode="w:gz") as tar:
-            tar.add(tmp_dir, arcname=os.path.basename("monitoring_data"))
+            tar.add(temp_file_path, arcname=os.path.basename("dataset.csv"))
+            tar.add(
+                f"{tmp_dir}/dataset_config.yaml",
+                arcname=os.path.basename("dataset_config.yaml"),
+            )
 
         # Upload to storage
         uploader.upload(