9 changes: 9 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,9 @@
+repos:
+  - repo: https://github.com/kynan/nbstripout
+    rev: 0.8.1
+    hooks:
+      - id: nbstripout
+  - repo: https://github.com/psf/black
+    rev: 25.1.0
+    hooks:
+      - id: black-jupyter
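For context, this is the kind of rewrite the new `black` hook enforces on every commit. A minimal sketch using black's Python API (the sample input string is invented for illustration; `black.format_str` and `black.Mode` are the documented library entry points):

```python
import black

# A line in the pre-PR style: single quotes, everything on one line.
src = "colors = ListedColormap([f'C{i}' for i in range(4)])\n"

# Default Mode() matches what the black-jupyter hook applies to notebook cells.
formatted = black.format_str(src, mode=black.Mode())
print(formatted)  # colors = ListedColormap([f"C{i}" for i in range(4)])
```

The `nbstripout` hook is complementary: it removes notebook outputs before they reach the repository, keeping diffs like the ones below readable.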
9 changes: 7 additions & 2 deletions README.md
@@ -1,4 +1,4 @@
-# A machine learning lecture <a rel="license" href="http://creativecommons.org/licenses/by-nc/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by-nc/4.0/88x31.png" /></a> [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/tudo-astroparticlephysics/machine-learning-lecture/main)
+# A machine learning lecture <a rel="license" href="http://creativecommons.org/licenses/by-nc/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by-nc/4.0/88x31.png" /></a> [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/tudo-astroparticlephysics/machine-learning-lecture/main)
 
 This collection of notebooks was started for a lecture on machine learning at the Universitat Autònoma de Barcelona.
 It has since grown into a large part of the statistical methods lecture (SMD) at the Physics department at TU Dortmund University.
@@ -29,14 +29,14 @@ The lecture material (e.g. jupyter notebooks) are shared under the Creative Comm
 
 
 ## Install `conda`
 
 To make sure all needed packages are installed in an environment for these lectures, we use
 `conda`.
 
 Download and install [Anaconda](https://www.anaconda.com/products/individual#Downloads) for a large collection of packages or [Miniconda](https://docs.conda.io/en/latest/miniconda.html) for a minimal starting point.
 
 ## Set up the environment
 
 
 After installing conda, run
 
 ```
@@ -52,6 +52,11 @@ $ conda activate ml
 ```
 every time before you start working on these lectures.
 
+Before committing any changes, make sure the pre-commit hooks are installed:
+```
+$ pre-commit install
+```
+
 From time to time, we will update the `environment.yml` with new versions or
 additional packages. To update your environment afterwards, run:
 ```
37 changes: 25 additions & 12 deletions ani_kmeans.py
@@ -9,16 +9,19 @@
 
 k = 4
 n_iters = 25
-discrete_cmap = ListedColormap([f'C{i}' for i in range(k)])
+discrete_cmap = ListedColormap([f"C{i}" for i in range(k)])
 fps = 25
 interval = 1000 / fps
 time_per_iter = 1
 frames = n_iters * time_per_iter * fps
 
 # choose initial cluster centers
 X, y = make_blobs(
-    n_samples=500, centers=k, center_box=(-2, 2),
-    cluster_std=0.5, random_state=1,
+    n_samples=500,
+    centers=k,
+    center_box=(-2, 2),
+    cluster_std=0.5,
+    random_state=1,
 )
 
 fig = plt.figure(figsize=(12.8, 7.2), dpi=100)
@@ -34,25 +37,26 @@
 
 
 center_lines = [ax.plot([], [])[0] for _ in range(k)]
-points = ax.scatter(X[:, 0], X[:, 1], c='k', cmap=discrete_cmap, alpha=0.5)
+points = ax.scatter(X[:, 0], X[:, 1], c="k", cmap=discrete_cmap, alpha=0.5)
 center_plot = ax.scatter(
     init_centers[:, 0],
     init_centers[:, 1],
     c=np.arange(k),
     cmap=discrete_cmap,
-    marker='h',
-    edgecolor='k',
+    marker="h",
+    edgecolor="k",
     s=400,
-    label='cluster center',
+    label="cluster center",
 )
 
-ax.legend(loc='upper right')
+ax.legend(loc="upper right")
 
+
 def init():
-    t = ax.set_title('iteration 0')
+    t = ax.set_title("iteration 0")
     return *center_lines, points, t
 
 
 def update(frame, bar=None):
     if bar is not None:
         bar.update(1)
@@ -68,15 +72,24 @@ def update(frame, bar=None):
         center_history[i] = init_centers
 
     for j, line in enumerate(center_lines):
-        line.set_data(center_history[:i + 1, j, 0], center_history[:i + 1, j, 1])
+        line.set_data(center_history[: i + 1, j, 0], center_history[: i + 1, j, 1])
 
     points.set_cmap(discrete_cmap)
-    t = ax.set_title('iteration {}'.format(i + 1))
+    t = ax.set_title("iteration {}".format(i + 1))
 
     return *center_lines, points, t
 
+
 bar = tqdm(total=frames)
-ani = FuncAnimation(fig, update, blit=True, init_func=init, frames=frames, fargs=(bar,), interval=interval)
+ani = FuncAnimation(
+    fig,
+    update,
+    blit=True,
+    init_func=init,
+    frames=frames,
+    fargs=(bar,),
+    interval=interval,
+)
 ani.save("kmeans_clustering.mp4")
 ani.pause()
 plt.close(fig)
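The collapsed hunks above hide the actual clustering code; for orientation, here is a minimal sketch of the Lloyd iteration such an animation typically steps through (`kmeans_step` is a hypothetical helper, not a function from this file; it assumes Euclidean distance and arrays shaped like `X` and `init_centers` above):

```python
import numpy as np


def kmeans_step(X, centers):
    """One Lloyd iteration: assign each point to its nearest center,
    then move every center to the mean of its assigned points."""
    # pairwise distances, shape (n_samples, k)
    distances = np.linalg.norm(X[:, None, :] - centers[None, :, :], axis=2)
    labels = distances.argmin(axis=1)
    new_centers = np.array([
        # keep the old center if a cluster ends up empty
        X[labels == j].mean(axis=0) if np.any(labels == j) else centers[j]
        for j in range(len(centers))
    ])
    return labels, new_centers
```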
16 changes: 10 additions & 6 deletions ml/plots.py
@@ -12,13 +12,14 @@
 rng = np.random.default_rng(0)
 
 
-colors = ['xkcd:sky', 'xkcd:grass']
+colors = ["xkcd:sky", "xkcd:grass"]
 cmap = ListedColormap(colors)
 
+
 def create_discrete_colormap(n_classes):
     if n_classes == 2:
         return cmap.copy()
-    return ListedColormap([f'C{i}' for i in range(n_classes)])
+    return ListedColormap([f"C{i}" for i in range(n_classes)])
 
 
 def set_plot_style():
@@ -34,7 +35,7 @@ def set_plot_style():
 
 def twospirals(n_samples, noise=0.5, rng=rng):
     """
-    Returns the two spirals dataset. 
+    Returns the two spirals dataset.
     """
     n = np.sqrt(rng.uniform(size=(n_samples, 1))) * 360 * (2 * np.pi) / 360
     d1x = -np.cos(n) * n + rng.uniform(size=(n_samples, 1)) * noise
@@ -67,7 +68,9 @@ def draw_linear_regression_function(reg, ax=None, **kwargs):
 def plot_3d_views(X, y, cmap=cmap):
     from mpl_toolkits.mplot3d import Axes3D  # noqa
 
-    fig, axs = plt.subplots(2, 2, subplot_kw={'projection': '3d'}, constrained_layout=False)
+    fig, axs = plt.subplots(
+        2, 2, subplot_kw={"projection": "3d"}, constrained_layout=False
+    )
 
     for ax in axs.ravel():
         ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=y, cmap=cmap, lw=0)
@@ -83,6 +86,7 @@ def plot_3d_views(X, y, cmap=cmap):
     axs[1, 1].view_init(90, 0)
     fig.subplots_adjust(wspace=0.005, hspace=0.005)
 
+
 def draw_tree(clf):
     import pydotplus
 
@@ -176,7 +180,7 @@ def plot_bars_and_confusion(
     axes=None,
     vmin=None,
     vmax=None,
-    cmap='inferno',
+    cmap="inferno",
     title=None,
     bar_color=None,
 ):
@@ -189,7 +193,7 @@ def plot_bars_and_confusion(
     if not isinstance(prediction, pd.Series):
         prediction = pd.Series(prediction)
 
-    correct = pd.Series(np.where(truth.values == prediction.values, 'Correct', 'Wrong'))
+    correct = pd.Series(np.where(truth.values == prediction.values, "Correct", "Wrong"))
 
     truth.sort_index(inplace=True)
     prediction.sort_index(inplace=True)
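As a usage note for the two helpers visible in this diff, a minimal sketch (it assumes `ml.plots` is importable from the repository root and that `twospirals` follows the common two-spirals recipe, returning a point array and a 0/1 label array):

```python
import matplotlib.pyplot as plt
from ml.plots import create_discrete_colormap, twospirals

X, y = twospirals(500, noise=0.75)

# with two classes, the predefined sky/grass colormap is reused
cmap = create_discrete_colormap(2)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap, s=10)
plt.show()
```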
15 changes: 8 additions & 7 deletions ml/solutions/exercise_1.py
@@ -1,6 +1,7 @@
 import matplotlib.pyplot as plt
 import numpy as np
 from sklearn import linear_model
+
 np.random.seed(1234)
 # create two gaussians
 A = np.random.multivariate_normal(mean=[1, 1], cov=[[2, 1], [1, 2]], size=200)
@@ -21,16 +22,16 @@
 x2s = (0.5 - b_0 - b_1 * x1s) / b_2
 
 
-plt.scatter(A[:, 0], A[:, 1], s=25, color='dodgerblue', label='True class A')
-plt.scatter(B[:, 0], B[:, 1], s=25, color='limegreen', label='True class B')
+plt.scatter(A[:, 0], A[:, 1], s=25, color="dodgerblue", label="True class A")
+plt.scatter(B[:, 0], B[:, 1], s=25, color="limegreen", label="True class B")
 
-plt.plot(x1s, x2s, color='gray', linestyle='--')
+plt.plot(x1s, x2s, color="gray", linestyle="--")
 
-plt.fill_between(x1s, x2s, 10, color='dodgerblue', alpha=0.07)
-plt.fill_between(x1s, x2s, -10, color='limegreen', alpha=0.07)
+plt.fill_between(x1s, x2s, 10, color="dodgerblue", alpha=0.07)
+plt.fill_between(x1s, x2s, -10, color="limegreen", alpha=0.07)
 plt.grid()
-plt.xlabel('X1')
-plt.ylabel('X2')
+plt.xlabel("X1")
+plt.ylabel("X2")
 plt.margins(x=0, y=0)
 plt.xlim([-8, 8])
 plt.ylim([-8, 8])
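For orientation on the `x2s` line in the visible hunk: the script fits a linear model to the two classes (presumably encoded as 0 and 1 in the collapsed lines) and draws the boundary where the fitted value crosses 1/2. In the script's coefficient names:

```latex
\hat{y}(x_1, x_2) = b_0 + b_1 x_1 + b_2 x_2 = \tfrac{1}{2}
\quad\Longrightarrow\quad
x_2 = \frac{\tfrac{1}{2} - b_0 - b_1 x_1}{b_2}
```

which is exactly the expression assigned to `x2s`.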
20 changes: 12 additions & 8 deletions ml/solutions/exercise_2.py
@@ -6,27 +6,31 @@
 np.random.seed(1234)
 data = read_titanic()
 
-X = data[['Sex_Code', 'Pclass_Code', 'Fare', 'Age']]
-y = data['Survived_Code']
+X = data[["Sex_Code", "Pclass_Code", "Fare", "Age"]]
+y = data["Survived_Code"]
 
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
 
 
 # Use linear kernel
-reg = SVC(kernel='linear')
+reg = SVC(kernel="linear")
 reg.fit(X_train, y_train)
 prediction_linear = reg.predict(X_test)
 
 # Use the rbf kernel
-reg_rbf = SVC(kernel='rbf')
+reg_rbf = SVC(kernel="rbf")
 reg_rbf.fit(X_train, y_train)
 prediction_rbf = reg_rbf.predict(X_test)
 
 fig, ([ax1, ax2], [ax3, ax4]) = plt.subplots(2, 2, figsize=(10, 10))
-plots.plot_bars_and_confusion(truth=y_test, prediction=prediction_linear, axes=[ax1, ax2], vmin=0, vmax=182)
-plots.plot_bars_and_confusion(truth=y_test, prediction=prediction_rbf, axes=[ax3, ax4], vmin=0, vmax=182)
-ax1.set_title('Linear Kernel')
-ax3.set_title('Radial Kernel')
+plots.plot_bars_and_confusion(
+    truth=y_test, prediction=prediction_linear, axes=[ax1, ax2], vmin=0, vmax=182
+)
+plots.plot_bars_and_confusion(
+    truth=y_test, prediction=prediction_rbf, axes=[ax3, ax4], vmin=0, vmax=182
+)
+ax1.set_title("Linear Kernel")
+ax3.set_title("Radial Kernel")
 ax1.set_xlim([0, 300])
 ax3.set_xlim([0, 300])
 None
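To put a single number next to each of the two confusion plots, a short sketch (not part of the solution file; it reuses the names defined above and scikit-learn's `accuracy_score`):

```python
from sklearn.metrics import accuracy_score

for name, pred in [("linear", prediction_linear), ("rbf", prediction_rbf)]:
    print(f"{name} kernel: accuracy = {accuracy_score(y_test, pred):.3f}")
```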
18 changes: 11 additions & 7 deletions ml/solutions/exercise_3.py
@@ -4,23 +4,27 @@
 import seaborn as sns
 import pandas as pd
 import numpy as np
+
 np.random.seed(1235)
 
 data = read_titanic()
 
-X = data[['Sex_Code', 'Pclass_Code', 'Fare', 'Age']]
-y = data['Survived']
+X = data[["Sex_Code", "Pclass_Code", "Fare", "Age"]]
+y = data["Survived"]
 
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
 
 df = pd.DataFrame()
-ps = ParameterGrid({'max_depth':range(1, 20), 'criterion':['entropy', 'gini']})
+ps = ParameterGrid({"max_depth": range(1, 20), "criterion": ["entropy", "gini"]})
 for d in ps:
-    clf = DecisionTreeClassifier(max_depth=d['max_depth'], criterion=d['criterion'])
+    clf = DecisionTreeClassifier(max_depth=d["max_depth"], criterion=d["criterion"])
     clf.fit(X_train, y_train)
     acc = accuracy_score(y_test, clf.predict(X_test))
-    df = df.append({'max_depth': d['max_depth'], 'criterion': d['criterion'], 'accuracy': acc}, ignore_index=True)
+    df = df.append(
+        {"max_depth": d["max_depth"], "criterion": d["criterion"], "accuracy": acc},
+        ignore_index=True,
+    )
 
-df = df.pivot('max_depth', 'criterion', 'accuracy')
-sns.heatmap(df, cmap='YlOrRd', annot=True, fmt='.3f')
+df = df.pivot("max_depth", "criterion", "accuracy")
+sns.heatmap(df, cmap="YlOrRd", annot=True, fmt=".3f")
 None
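One caveat: `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0, and `DataFrame.pivot` takes keyword-only arguments in current pandas, so this solution only runs on older versions. A version-proof sketch of the same loop, collecting rows in a list instead (same variable names as above, assuming the rest of the script):

```python
records = []
for d in ps:
    clf = DecisionTreeClassifier(max_depth=d["max_depth"], criterion=d["criterion"])
    clf.fit(X_train, y_train)
    acc = accuracy_score(y_test, clf.predict(X_test))
    records.append(
        {"max_depth": d["max_depth"], "criterion": d["criterion"], "accuracy": acc}
    )

df = pd.DataFrame(records)
df = df.pivot(index="max_depth", columns="criterion", values="accuracy")
sns.heatmap(df, cmap="YlOrRd", annot=True, fmt=".3f")
```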
24 changes: 13 additions & 11 deletions ml/solutions/exercise_5.py
@@ -6,25 +6,27 @@
 np.random.seed(1234)
 data = read_titanic()
 
-X = data[['Sex_Code', 'Pclass_Code', 'Fare', 'Age']]
-y = data['Survived_Code']
+X = data[["Sex_Code", "Pclass_Code", "Fare", "Age"]]
+y = data["Survived_Code"]
 
-svc = SVC(kernel='linear')
+svc = SVC(kernel="linear")
 knn = KNeighborsClassifier(n_neighbors=5)
 tree = DecisionTreeClassifier(max_depth=5)
 
 results = []
-for clf, name in zip([svc, knn, tree], ['SVM', 'kNN', 'tree']):
-    r = cross_validate(clf, X=X, y=y, cv=5, scoring=['accuracy', 'precision', 'recall', 'f1'])
+for clf, name in zip([svc, knn, tree], ["SVM", "kNN", "tree"]):
+    r = cross_validate(
+        clf, X=X, y=y, cv=5, scoring=["accuracy", "precision", "recall", "f1"]
+    )
     df = pd.DataFrame().from_dict(r)
-    df['classifier'] = name
+    df["classifier"] = name
     results.append(df)
 
-df = pd.concat(results).drop(['fit_time', 'score_time'], axis='columns')
+df = pd.concat(results).drop(["fit_time", "score_time"], axis="columns")
 
-means = df.groupby('classifier').mean()
-deviations = df.groupby('classifier').std()
+means = df.groupby("classifier").mean()
+deviations = df.groupby("classifier").std()
 
 fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 4))
-sns.heatmap(means, cmap='viridis', annot=True, ax=ax1, vmin=0, vmax=1)
-sns.heatmap(deviations, cmap='viridis', annot=True, ax=ax2, vmin=0, vmax=1)
+sns.heatmap(means, cmap="viridis", annot=True, ax=ax1, vmin=0, vmax=1)
+sns.heatmap(deviations, cmap="viridis", annot=True, ax=ax2, vmin=0, vmax=1)
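For readers new to `cross_validate`: with a list of scorers it returns a plain dict of per-fold arrays, which is why the loop above can wrap the result directly in a DataFrame. A minimal sketch of that shape, reusing the `svc` defined above (key names follow scikit-learn's documented `test_<scorer>` convention):

```python
from sklearn.model_selection import cross_validate

r = cross_validate(svc, X=X, y=y, cv=5, scoring=["accuracy", "f1"])

print(sorted(r))
# ['fit_time', 'score_time', 'test_accuracy', 'test_f1']
print(r["test_accuracy"])  # five values, one per CV fold
```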