add token/revision in source + better error message

lhoestq · web-flow · commit eabf403e7839 · 2025-03-14T18:41:05.000+01:00
diff --git a/pyspark_huggingface/huggingface_source.py b/pyspark_huggingface/huggingface_source.py
@@ -80,12 +80,14 @@ def __init__(self, options):
 
         if "path" not in options or not options["path"]:
             raise Exception("You must specify a dataset name.")
-
+        
         kwargs = dict(self.options)
         self.dataset_name = kwargs.pop("path")
         self.config_name = kwargs.pop("config", None)
         self.split = kwargs.pop("split", self.DEFAULT_SPLIT)
+        self.revision = kwargs.pop("revision", None)
         self.streaming = kwargs.pop("streaming", "true").lower() == "true"
+        self.token = kwargs.pop("token", None)
         for arg in kwargs:
             if kwargs[arg].lower() == "true":
                 kwargs[arg] = True
@@ -96,8 +98,12 @@ def __init__(self, options):
                     kwargs[arg] = ast.literal_eval(kwargs[arg])
                 except ValueError:
                     pass
+                    
+        # Query the Hub first so a missing dataset or revision fails with a clear Hub error
+        api = self._get_api()
+        api.repo_info(self.dataset_name, repo_type="dataset", revision=self.revision)
 
-        self.builder = load_dataset_builder(self.dataset_name, self.config_name, **kwargs)
+        self.builder = load_dataset_builder(self.dataset_name, self.config_name, token=self.token, revision=self.revision, **kwargs)
         streaming_dataset = self.builder.as_streaming_dataset()
         if self.split not in streaming_dataset:
             raise Exception(f"Split {self.split} is invalid. Valid options are {list(streaming_dataset)}")
@@ -106,6 +112,11 @@ def __init__(self, options):
         if not self.streaming_dataset.features:
             self.streaming_dataset = self.streaming_dataset._resolve_features()
 
+    def _get_api(self):
+        from huggingface_hub import HfApi
+
+        return HfApi(token=self.token, library_name="pyspark_huggingface")
+
     @classmethod
     def name(cls):
         return "huggingfacesource"