Skip to content

Commit dd17bf9

Browse files
committed
add SGD to benchmarks and use the current credentials to obtain the tenant, subscription, and principal ids
1 parent fa9e94e commit dd17bf9

File tree

1 file changed

+38
-28
lines changed

1 file changed

+38
-28
lines changed

docs/benchmarks/ebm-benchmark.ipynb

Lines changed: 38 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -117,8 +117,9 @@
117117
" \"rf_sk\",\n",
118118
" \"ert\",\n",
119119
" \"elastic\",\n",
120+
" \"sgd\",\n",
120121
" \"lm\",\n",
121-
" \"lm_svm\",\n",
122+
" \"lsvm\",\n",
122123
" \"svm\",\n",
123124
" \"nn\",\n",
124125
" \"knn\",\n",
@@ -136,13 +137,14 @@
136137
"def trial_runner(trial):\n",
137138
" seed=42 + int(trial.replicate_num)\n",
138139
" max_samples = 1000000000000\n",
140+
" n_calibration_folds = 4 # 4 uses all cores on the containers\n",
139141
"\n",
140142
" from interpret.glassbox import ExplainableBoostingClassifier, ExplainableBoostingRegressor\n",
141143
" from xgboost import XGBClassifier, XGBRegressor, XGBRFClassifier, XGBRFRegressor\n",
142144
" from lightgbm import LGBMClassifier, LGBMRegressor\n",
143145
" from catboost import CatBoostClassifier, CatBoostRegressor\n",
144146
" from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, ExtraTreesClassifier, ExtraTreesRegressor\n",
145-
" from sklearn.linear_model import LogisticRegression, LinearRegression, ElasticNet\n",
147+
" from sklearn.linear_model import LogisticRegression, LinearRegression, ElasticNet, SGDClassifier, SGDRegressor\n",
146148
" from sklearn.svm import LinearSVC, LinearSVR, SVC, SVR\n",
147149
" from sklearn.neural_network import MLPClassifier, MLPRegressor\n",
148150
" from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor\n",
@@ -178,7 +180,7 @@
178180
" pass # Re-enable stratification if dataset fails from absent class in train/test sets (PMLB)\n",
179181
" \n",
180182
" fit_params = {}\n",
181-
" fit_params[\"X\"], X_test, fit_params[\"y\"], y_test = train_test_split(X, y, test_size=0.3, stratify=stratification, random_state=seed)\n",
183+
" fit_params[\"X\"], X_test, fit_params[\"y\"], y_test = train_test_split(X, y, test_size=0.2, stratify=stratification, random_state=seed)\n",
182184
" del X\n",
183185
"\n",
184186
" # Build optional preprocessor for use by methods below\n",
@@ -197,8 +199,9 @@
197199
" rf_sk_params = {}\n",
198200
" ert_params = {}\n",
199201
" elastic_params = {}\n",
202+
" sgd_params = {}\n",
200203
" lm_params = {}\n",
201-
" lm_svm_params = {}\n",
204+
" lsvm_params = {}\n",
202205
" svm_params = {}\n",
203206
" nn_params = {}\n",
204207
" knn_params = {}\n",
@@ -212,15 +215,16 @@
212215
" catboost_params[\"verbose\"] = False\n",
213216
" rf_xgb_params[\"enable_categorical\"] = True\n",
214217
" rf_xgb_params[\"feature_types\"] = [\"c\" if cat else \"q\" for cat in cat_bools]\n",
215-
" rf_sk_params[\"random_state\"] = seed # TODO: is this needed for reproducibility?\n",
218+
" rf_sk_params[\"random_state\"] = seed\n",
216219
" rf_sk_params[\"n_jobs\"] = -1\n",
217220
" ert_params[\"n_jobs\"] = -1\n",
218-
" ert_params[\"random_state\"] = seed # TODO: is this needed for reproducibility?\n",
219-
" elastic_params[\"random_state\"] = seed # TODO: is this needed for reproducibility?\n",
221+
" ert_params[\"random_state\"] = seed\n",
222+
" elastic_params[\"random_state\"] = seed\n",
220223
" # elastic_params[\"selection\"] = 'cyclic' # 'random' # TODO: try both\n",
224+
" sgd_params[\"random_state\"] = seed\n",
221225
" lm_params[\"n_jobs\"] = -1\n",
222-
" lm_svm_params[\"random_state\"] = seed # TODO: is this needed for reproducibility?\n",
223-
" nn_params[\"random_state\"] = seed # TODO: is this needed for reproducibility?\n",
226+
" lsvm_params[\"random_state\"] = seed\n",
227+
" nn_params[\"random_state\"] = seed\n",
224228
" knn_params[\"n_jobs\"] = -1\n",
225229
" aplr_params[\"m\"] = 3000\n",
226230
"\n",
@@ -241,7 +245,8 @@
241245
" #rf_sk_params[\"n_estimators\"] = 1\n",
242246
" #ert_params[\"n_estimators\"] = 1\n",
243247
" #elastic_params[\"max_iter\"] = 1\n",
244-
" #lm_svm_params[\"max_iter\"] = 1\n",
248+
" #sgd_params[\"max_iter\"] = 1\n",
249+
" #lsvm_params[\"max_iter\"] = 1\n",
245250
" #nn_params[\"max_iter\"] = 1\n",
246251
" #knn_params[\"n_neighbors\"] = 1\n",
247252
" #knn_params[\"leaf_size\"] = 1\n",
@@ -270,30 +275,28 @@
270275
" elif trial.method.name == \"elastic\":\n",
271276
" elastic_params[\"n_jobs\"] = -1\n",
272277
" est = Pipeline([(\"ct\", ct), (\"est\", LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5, **elastic_params))])\n",
278+
" elif trial.method.name == \"sgd\":\n",
279+
" est = Pipeline([(\"ct\", ct), (\"est\", CalibratedClassifierCV(SGDClassifier(**sgd_params), n_jobs=-1, cv=n_calibration_folds))])\n",
273280
" elif trial.method.name == \"lm\":\n",
274-
" lm_params[\"random_state\"] = seed # TODO: is this needed for reproducibility?\n",
281+
" lm_params[\"random_state\"] = seed\n",
275282
" est = Pipeline([(\"ct\", ct), (\"est\", LogisticRegression(**lm_params))])\n",
276-
" elif trial.method.name == \"lm_svm\":\n",
283+
" elif trial.method.name == \"lsvm\":\n",
277284
" if trial.task.name in {\"CIFAR_10\", \"Devnagari-Script\"}:\n",
278285
" max_samples = 10000 # crashes or fit time too long without subsampling\n",
279-
" if trial.task.problem == \"multiclass\":\n",
280-
" est = Pipeline([(\"ct\", ct), (\"est\", CalibratedClassifierCV(OneVsRestClassifier(LinearSVC(**lm_svm_params), n_jobs=-1)))])\n",
281-
" else:\n",
282-
" est = Pipeline([(\"ct\", ct), (\"est\", CalibratedClassifierCV(LinearSVC(**lm_svm_params), n_jobs=-1))])\n",
286+
" est = Pipeline([(\"ct\", ct), (\"est\", CalibratedClassifierCV(LinearSVC(**lsvm_params), n_jobs=-1, cv=n_calibration_folds))])\n",
283287
" elif trial.method.name == \"svm\":\n",
284288
" if trial.task.name in {\"CIFAR_10\", \"Devnagari-Script\"}:\n",
285289
" max_samples = 10000 # crashes or fit time too long without subsampling\n",
286290
" svm_params[\"random_state\"] = seed\n",
287-
" if trial.task.problem == \"multiclass\":\n",
288-
" est = Pipeline([(\"ct\", ct), (\"est\", CalibratedClassifierCV(OneVsRestClassifier(SVC(**svm_params), n_jobs=-1)))])\n",
289-
" else:\n",
290-
" est = Pipeline([(\"ct\", ct), (\"est\", CalibratedClassifierCV(SVC(**svm_params), n_jobs=-1))])\n",
291+
" est = Pipeline([(\"ct\", ct), (\"est\", CalibratedClassifierCV(SVC(**svm_params), n_jobs=-1, cv=n_calibration_folds))])\n",
291292
" elif trial.method.name == \"nn\":\n",
292293
" est = Pipeline([(\"ct\", ct), (\"est\", MLPClassifier(**nn_params))])\n",
293294
" elif trial.method.name == \"knn\":\n",
294295
" est = Pipeline([(\"ct\", ct), (\"est\", KNeighborsClassifier(**knn_params))])\n",
295296
" elif trial.method.name == \"aplr\":\n",
296297
" ct.sparse_threshold = 0 # APLR only handles dense\n",
298+
" if trial.task.name in {\"CIFAR_10\", \"Fashion-MNIST\", \"Devnagari-Script\", \"mnist_784\"}:\n",
299+
" max_samples = 10000 # crashes or fit time too long without subsampling\n",
297300
" est = Pipeline([(\"ct\", ct), (\"est\", APLRClassifier(**aplr_params))])\n",
298301
" fit_params[\"y\"] = fit_params[\"y\"].astype(str).to_numpy()\n",
299302
" y_test = y_test.astype(str).to_numpy()\n",
@@ -326,12 +329,14 @@
326329
" est = Pipeline([(\"ct\", ct), (\"est\", ExtraTreesRegressor(**ert_params))])\n",
327330
" elif trial.method.name == \"elastic\":\n",
328331
" est = Pipeline([(\"ct\", ct), (\"est\", ElasticNet(**elastic_params))])\n",
332+
" elif trial.method.name == \"sgd\":\n",
333+
" est = Pipeline([(\"ct\", ct), (\"est\", SGDRegressor(**sgd_params))])\n",
329334
" elif trial.method.name == \"lm\":\n",
330335
" est = Pipeline([(\"ct\", ct), (\"est\", LinearRegression(**lm_params))])\n",
331-
" elif trial.method.name == \"lm_svm\":\n",
332-
" est = Pipeline([(\"ct\", ct), (\"est\", LinearSVR(**lm_svm_params))])\n",
336+
" elif trial.method.name == \"lsvm\":\n",
337+
" est = Pipeline([(\"ct\", ct), (\"est\", LinearSVR(**lsvm_params))])\n",
333338
" elif trial.method.name == \"svm\":\n",
334-
" if trial.task.name in {\"Buzzinsocialmedia_Twitter\", \"nyc-taxi-green-dec-2016\", \"Airlines_DepDelay_10M\"}:\n",
339+
" if trial.task.name in {\"Buzzinsocialmedia_Twitter\", \"nyc-taxi-green-dec-2016\", \"Airlines_DepDelay_10M\", \"Yolanda\"}:\n",
335340
" max_samples = 100000 # crashes or fit time too long without subsampling\n",
336341
" est = Pipeline([(\"ct\", ct), (\"est\", SVR(**svm_params))])\n",
337342
" elif trial.method.name == \"nn\":\n",
@@ -341,9 +346,9 @@
341346
" max_samples = 100000 # crashes or fit time too long without subsampling\n",
342347
" est = Pipeline([(\"ct\", ct), (\"est\", KNeighborsRegressor(**knn_params))])\n",
343348
" elif trial.method.name == \"aplr\":\n",
349+
" ct.sparse_threshold = 0 # APLR only handles dense\n",
344350
" if trial.task.name in {\"Airlines_DepDelay_10M\"}:\n",
345351
" max_samples = 100000 # crashes or fit time too long without subsampling\n",
346-
" ct.sparse_threshold = 0 # APLR only handles dense\n",
347352
" est = Pipeline([(\"ct\", ct), (\"est\", APLRRegressor(**aplr_params))])\n",
348353
" fit_params[\"y\"] = fit_params[\"y\"].astype(str).to_numpy()\n",
349354
" y_test = y_test.astype(str).to_numpy()\n",
@@ -441,19 +446,24 @@
441446
"if is_local:\n",
442447
" conn_str = f\"sqlite:///{os.getcwd()}/powerlift.db\"\n",
443448
"else:\n",
449+
" import requests\n",
450+
" import json\n",
451+
" import subprocess\n",
444452
" from azure.identity import AzureCliCredential\n",
445453
" credential = AzureCliCredential()\n",
454+
" access_token = credential.get_token(\"https://graph.microsoft.com/.default\").token\n",
455+
" headers = {'Authorization': f'Bearer {access_token}', 'Content-Type': 'application/json'}\n",
456+
" azure_client_id = requests.get('https://graph.microsoft.com/v1.0/me', headers=headers).json().get('id')\n",
457+
" azure_tenant_id = requests.get('https://graph.microsoft.com/v1.0/organization', headers=headers).json()['value'][0].get('id')\n",
458+
" subscription_id = json.loads(subprocess.run(\"az account show\", capture_output=True, text=True, shell=True).stdout).get(\"id\")\n",
446459
" \n",
447460
" from dotenv import load_dotenv\n",
448461
" load_dotenv()\n",
449462
" TIMEOUT_SEC = 60 * 60 * 24 * 180 # 180 days\n",
450463
" wheel_filepaths = [\"interpret_core-0.6.3-py3-none-any.whl\", \"powerlift-0.1.11-py3-none-any.whl\"]\n",
451464
" n_containers=198\n",
452465
" conn_str = os.getenv(\"DOCKER_DB_URL\")\n",
453-
" azure_tenant_id = os.getenv(\"AZURE_TENANT_ID\")\n",
454-
" azure_client_id = os.getenv(\"AZURE_CLIENT_ID\")\n",
455-
" azure_client_secret = os.getenv(\"AZURE_CLIENT_SECRET\")\n",
456-
" subscription_id = os.getenv(\"AZURE_SUBSCRIPTION_ID\")\n",
466+
" azure_client_secret = None # use default credentials instead\n",
457467
" resource_group = os.getenv(\"AZURE_RESOURCE_GROUP\")\n",
458468
"\n",
459469
"from powerlift.bench import retrieve_openml_automl_regression, retrieve_openml_automl_classification, retrieve_openml_cc18, retrieve_catboost_50k, retrieve_pmlb\n",

0 commit comments

Comments
 (0)