Skip to content

Commit ee728b6

Browse files
authored
Merge pull request #27 from bigdata-ustc/IRR
[FEATURE] Item Response Ranking with DINA, MIRT and NCDM
2 parents 8e562d8 + e3cbb87 commit ee728b6

File tree

23 files changed

+1227
-282
lines changed

23 files changed

+1227
-282
lines changed

CHANGE.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
v0.0.10:
22
* add STE operator in DINA
33
* add Multidimensional Item Response Theory (MIRT)
4+
* add IRR-DINA, IRR-MIRT, IRR-NCDM
45

56
v0.0.9:
67
* add Item Response Ranking for Cognitive Diagnosis (IRR)

EduCDM/IRR/DINA.py

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
# coding: utf-8
2+
# 2021/7/1 @ tongshiwei
3+
4+
import pandas as pd
5+
import numpy as np
6+
import torch
7+
from torch import nn
8+
from EduCDM import GDDINA
9+
from .loss import PairSCELoss, HarmonicLoss, loss_mask
10+
from tqdm import tqdm
11+
from longling.ML.metrics import ranking_report
12+
13+
14+
class DINA(GDDINA):
    """Item Response Ranking DINA.

    Extends the gradient-descent DINA model with a pairwise ranking
    objective: for each (user, item) positive response, responses of
    sampled "negative" users on the same item are ranked against it.
    The total loss is a harmonic combination (weighted by ``zeta``) of
    the pointwise BCE loss and the pairwise ranking loss.
    """

    def __init__(self, user_num, item_num, knowledge_num, ste=False, zeta=0.5):
        """
        Parameters
        ----------
        user_num, item_num, knowledge_num : int
            Sizes forwarded to the base GDDINA model.
        ste : bool
            Whether to use the straight-through estimator in DINA.
        zeta : float
            Trade-off weight between point loss and pair loss in HarmonicLoss.
        """
        super(DINA, self).__init__(user_num, item_num, knowledge_num, ste)
        self.zeta = zeta

    def train(self, train_data, test_data=None, *, epoch: int, device="cpu", lr=0.001) -> ...:
        """Train with joint pointwise + pairwise loss.

        Each train batch yields
        ``(user_id, item_id, knowledge, score, n_samples, *neg_users)``
        where ``neg_users`` are zero or more tensors of negative-sample
        user ids and ``n_samples`` masks how many are valid per row.
        """
        point_loss_function = nn.BCELoss()
        pair_loss_function = PairSCELoss()
        loss_function = HarmonicLoss(self.zeta)

        trainer = torch.optim.Adam(self.dina_net.parameters(), lr, weight_decay=1e-4)

        for e in range(epoch):
            point_losses = []
            pair_losses = []
            losses = []
            for batch_data in tqdm(train_data, "Epoch %s" % e):
                user_id, item_id, knowledge, score, n_samples, *neg_users = batch_data
                user_id: torch.Tensor = user_id.to(device)
                item_id: torch.Tensor = item_id.to(device)
                knowledge: torch.Tensor = knowledge.to(device)
                predicted_pos_score: torch.Tensor = self.dina_net(user_id, item_id, knowledge)
                score: torch.Tensor = score.to(device)
                neg_score = 1 - score

                point_loss = point_loss_function(predicted_pos_score, score)
                predicted_neg_scores = []
                if neg_users:
                    for neg_user in neg_users:
                        # Fix: negative users must also live on `device`,
                        # otherwise training crashes for device != "cpu".
                        predicted_neg_score = self.dina_net(neg_user.to(device), item_id, knowledge)
                        predicted_neg_scores.append(predicted_neg_score)

                    # pairwise ranking loss: positive response should
                    # outrank each sampled negative response
                    pair_pred_loss_list = [
                        pair_loss_function(
                            predicted_pos_score,
                            predicted_neg_score,
                            score - neg_score
                        )
                        for predicted_neg_score in predicted_neg_scores
                    ]

                    # mask out padded (invalid) negative samples
                    pair_loss = sum(loss_mask(pair_pred_loss_list, n_samples))
                else:
                    pair_loss = 0

                loss = loss_function(point_loss, pair_loss)

                # back propagation
                trainer.zero_grad()
                loss.backward()
                trainer.step()

                point_losses.append(point_loss.mean().item())
                # pair_loss is the int 0 when no negative samples exist
                pair_losses.append(pair_loss.mean().item() if not isinstance(pair_loss, int) else pair_loss)
                losses.append(loss.item())
            print(
                "[Epoch %d] Loss: %.6f, PointLoss: %.6f, PairLoss: %.6f" % (
                    e, float(np.mean(losses)), float(np.mean(point_losses)), float(np.mean(pair_losses))
                )
            )

            if test_data is not None:
                eval_data = self.eval(test_data)
                print("[Epoch %d]\n%s" % (e, eval_data))

    def eval(self, test_data, device="cpu"):
        """Evaluate with per-item ranking metrics.

        Groups predictions by item and reports ranking metrics via
        ``ranking_report`` (padding shorter groups). Restores the net
        to train mode before returning.
        """
        self.dina_net.eval()
        y_pred = []
        y_true = []
        items = []
        for batch_data in tqdm(test_data, "evaluating"):
            user_id, item_id, knowledge, response = batch_data
            user_id: torch.Tensor = user_id.to(device)
            item_id: torch.Tensor = item_id.to(device)
            # Fix: knowledge must be moved to `device` like the other
            # inputs (train() already does this).
            knowledge: torch.Tensor = knowledge.to(device)
            pred: torch.Tensor = self.dina_net(user_id, item_id, knowledge)
            y_pred.extend(pred.tolist())
            y_true.extend(response.tolist())
            items.extend(item_id.tolist())

        df = pd.DataFrame({
            "item_id": items,
            "score": y_true,
            "pred": y_pred,
        })

        ground_truth = []
        prediction = []

        for _, group_df in tqdm(df.groupby("item_id"), "formatting item df"):
            ground_truth.append(group_df["score"].values)
            prediction.append(group_df["pred"].values)

        self.dina_net.train()

        return ranking_report(
            ground_truth,
            y_pred=prediction,
            coerce="padding"
        )

EduCDM/IRR/IRT.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import numpy as np
99
import pandas as pd
1010
from .loss import PairSCELoss, HarmonicLoss, loss_mask
11-
from .metrics import ranking_report, result_format
11+
from longling.ML.metrics import ranking_report
1212

1313
__all__ = ["IRT"]
1414

@@ -78,9 +78,9 @@ def train(self, train_data, test_data=None, *, epoch: int, device="cpu", lr=0.00
7878

7979
if test_data is not None:
8080
eval_data = self.eval(test_data)
81-
print("[Epoch %d]\n%s" % (e, result_format(eval_data)))
81+
print("[Epoch %d]\n%s" % (e, eval_data))
8282

83-
def eval(self, test_data, device="cpu") -> tuple:
83+
def eval(self, test_data, device="cpu"):
8484
self.irt_net.eval()
8585
y_pred = []
8686
y_true = []
@@ -112,6 +112,5 @@ def eval(self, test_data, device="cpu") -> tuple:
112112
return ranking_report(
113113
ground_truth,
114114
y_pred=prediction,
115-
# coerce="abandon",
116115
coerce="padding"
117116
)

EduCDM/IRR/MIRT.py

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
# coding: utf-8
2+
# 2021/7/1 @ tongshiwei
3+
4+
5+
import torch
6+
from torch import nn
7+
from tqdm import tqdm
8+
from EduCDM import MIRT as PointMIRT
9+
import numpy as np
10+
import pandas as pd
11+
from .loss import PairSCELoss, HarmonicLoss, loss_mask
12+
from longling.ML.metrics import ranking_report
13+
14+
__all__ = ["MIRT"]
15+
16+
17+
class MIRT(PointMIRT):
    """Item Response Ranking MIRT.

    Extends multidimensional IRT with a pairwise ranking objective:
    each observed response is ranked against sampled negative users'
    responses on the same item, and the total loss harmonically
    combines (weight ``zeta``) pointwise BCE with the pairwise loss.
    """

    def __init__(self, user_num, item_num, knowledge_num, latent_dim=None, zeta=0.5):
        """
        Parameters
        ----------
        user_num, item_num : int
            Sizes forwarded to the base MIRT model.
        knowledge_num : int
            Number of knowledge concepts; also the default latent dim.
        latent_dim : int, optional
            Latent trait dimensionality; defaults to ``knowledge_num``.
        zeta : float
            Trade-off weight between point and pair loss in HarmonicLoss.
        """
        latent_dim = knowledge_num if latent_dim is None else latent_dim
        super(MIRT, self).__init__(user_num, item_num, latent_dim)
        self.knowledge_num = knowledge_num
        self.zeta = zeta

    def train(self, train_data, test_data=None, *, epoch: int, device="cpu", lr=0.001) -> ...:
        """Train with joint pointwise + pairwise loss.

        Batches yield ``(user_id, item_id, knowledge, score, n_samples,
        *neg_users)``; the knowledge tensor is unused by MIRT.
        """
        point_loss_function = nn.BCELoss()
        pair_loss_function = PairSCELoss()
        loss_function = HarmonicLoss(self.zeta)

        trainer = torch.optim.Adam(self.irt_net.parameters(), lr, weight_decay=1e-4)

        for e in range(epoch):
            point_losses = []
            pair_losses = []
            losses = []
            for batch_data in tqdm(train_data, "Epoch %s" % e):
                user_id, item_id, _, score, n_samples, *neg_users = batch_data
                user_id: torch.Tensor = user_id.to(device)
                item_id: torch.Tensor = item_id.to(device)
                predicted_pos_score: torch.Tensor = self.irt_net(user_id, item_id)
                score: torch.Tensor = score.to(device)
                neg_score = 1 - score

                point_loss = point_loss_function(predicted_pos_score, score)
                predicted_neg_scores = []
                if neg_users:
                    for neg_user in neg_users:
                        # Fix: negative users must also live on `device`,
                        # otherwise training crashes for device != "cpu".
                        predicted_neg_score = self.irt_net(neg_user.to(device), item_id)
                        predicted_neg_scores.append(predicted_neg_score)

                    # pairwise ranking loss: positive response should
                    # outrank each sampled negative response
                    pair_pred_loss_list = [
                        pair_loss_function(
                            predicted_pos_score,
                            predicted_neg_score,
                            score - neg_score
                        )
                        for predicted_neg_score in predicted_neg_scores
                    ]

                    # mask out padded (invalid) negative samples
                    pair_loss = sum(loss_mask(pair_pred_loss_list, n_samples))
                else:
                    pair_loss = 0

                loss = loss_function(point_loss, pair_loss)

                # back propagation
                trainer.zero_grad()
                loss.backward()
                trainer.step()

                point_losses.append(point_loss.mean().item())
                # pair_loss is the int 0 when no negative samples exist
                pair_losses.append(pair_loss.mean().item() if not isinstance(pair_loss, int) else pair_loss)
                losses.append(loss.item())
            print(
                "[Epoch %d] Loss: %.6f, PointLoss: %.6f, PairLoss: %.6f" % (
                    e, float(np.mean(losses)), float(np.mean(point_losses)), float(np.mean(pair_losses))
                )
            )

            if test_data is not None:
                eval_data = self.eval(test_data)
                print("[Epoch %d]\n%s" % (e, eval_data))

    def eval(self, test_data, device="cpu"):
        """Evaluate with per-item ranking metrics.

        Groups predictions by item and reports ranking metrics via
        ``ranking_report`` (padding shorter groups). Restores the net
        to train mode before returning.
        """
        self.irt_net.eval()
        y_pred = []
        y_true = []
        items = []
        for batch_data in tqdm(test_data, "evaluating"):
            user_id, item_id, _, response = batch_data
            user_id: torch.Tensor = user_id.to(device)
            item_id: torch.Tensor = item_id.to(device)
            pred: torch.Tensor = self.irt_net(user_id, item_id)
            y_pred.extend(pred.tolist())
            y_true.extend(response.tolist())
            items.extend(item_id.tolist())

        df = pd.DataFrame({
            "item_id": items,
            "score": y_true,
            "pred": y_pred,
        })

        ground_truth = []
        prediction = []

        for _, group_df in tqdm(df.groupby("item_id"), "formatting item df"):
            ground_truth.append(group_df["score"].values)
            prediction.append(group_df["pred"].values)

        self.irt_net.train()

        return ranking_report(
            ground_truth,
            y_pred=prediction,
            coerce="padding"
        )

EduCDM/IRR/NCDM.py

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
# coding: utf-8
2+
# 2021/7/1 @ tongshiwei
3+
4+
import pandas as pd
5+
import numpy as np
6+
import torch
7+
from torch import nn
8+
from EduCDM import NCDM as PointNCDM
9+
from .loss import PairSCELoss, HarmonicLoss, loss_mask
10+
from tqdm import tqdm
11+
from longling.ML.metrics import ranking_report
12+
13+
14+
class NCDM(PointNCDM):
    """Item Response Ranking NCDM.

    Extends the neural cognitive diagnosis model with a pairwise
    ranking objective: each observed response is ranked against
    sampled negative users' responses on the same item, and the total
    loss harmonically combines (weight ``zeta``) pointwise BCE with
    the pairwise loss.
    """

    def __init__(self, user_num, item_num, knowledge_num, zeta=0.5):
        """
        Parameters
        ----------
        user_num, item_num, knowledge_num : int
            Sizes forwarded to the base NCDM model (note the base
            constructor takes them in knowledge/item/user order).
        zeta : float
            Trade-off weight between point and pair loss in HarmonicLoss.
        """
        super(NCDM, self).__init__(knowledge_num, item_num, user_num)
        self.zeta = zeta

    def train(self, train_data, test_data=None, epoch=10, device="cpu", lr=0.002, silence=False) -> ...:
        """Train with joint pointwise + pairwise loss.

        Batches yield ``(user_id, item_id, knowledge, score, n_samples,
        *neg_users)``. ``silence`` is accepted for interface
        compatibility with the base class but is currently unused.
        """
        point_loss_function = nn.BCELoss()
        pair_loss_function = PairSCELoss()
        loss_function = HarmonicLoss(self.zeta)

        trainer = torch.optim.Adam(self.ncdm_net.parameters(), lr, weight_decay=1e-4)

        for e in range(epoch):
            point_losses = []
            pair_losses = []
            losses = []
            for batch_data in tqdm(train_data, "Epoch %s" % e):
                user_id, item_id, knowledge, score, n_samples, *neg_users = batch_data
                user_id: torch.Tensor = user_id.to(device)
                item_id: torch.Tensor = item_id.to(device)
                knowledge: torch.Tensor = knowledge.to(device)
                predicted_pos_score: torch.Tensor = self.ncdm_net(user_id, item_id, knowledge)
                score: torch.Tensor = score.to(device)
                neg_score = 1 - score

                point_loss = point_loss_function(predicted_pos_score, score)
                predicted_neg_scores = []
                if neg_users:
                    for neg_user in neg_users:
                        # Fix: negative users must also live on `device`,
                        # otherwise training crashes for device != "cpu".
                        predicted_neg_score = self.ncdm_net(neg_user.to(device), item_id, knowledge)
                        predicted_neg_scores.append(predicted_neg_score)

                    # pairwise ranking loss: positive response should
                    # outrank each sampled negative response
                    pair_pred_loss_list = [
                        pair_loss_function(
                            predicted_pos_score,
                            predicted_neg_score,
                            score - neg_score
                        )
                        for predicted_neg_score in predicted_neg_scores
                    ]

                    # mask out padded (invalid) negative samples
                    pair_loss = sum(loss_mask(pair_pred_loss_list, n_samples))
                else:
                    pair_loss = 0

                loss = loss_function(point_loss, pair_loss)

                # back propagation
                trainer.zero_grad()
                loss.backward()
                trainer.step()

                point_losses.append(point_loss.mean().item())
                # pair_loss is the int 0 when no negative samples exist
                pair_losses.append(pair_loss.mean().item() if not isinstance(pair_loss, int) else pair_loss)
                losses.append(loss.item())
            print(
                "[Epoch %d] Loss: %.6f, PointLoss: %.6f, PairLoss: %.6f" % (
                    e, float(np.mean(losses)), float(np.mean(point_losses)), float(np.mean(pair_losses))
                )
            )

            if test_data is not None:
                eval_data = self.eval(test_data)
                print("[Epoch %d]\n%s" % (e, eval_data))

    def eval(self, test_data, device="cpu"):
        """Evaluate with per-item ranking metrics.

        Groups predictions by item and reports ranking metrics via
        ``ranking_report`` (padding shorter groups). Restores the net
        to train mode before returning.
        """
        self.ncdm_net.eval()
        y_pred = []
        y_true = []
        items = []
        for batch_data in tqdm(test_data, "evaluating"):
            user_id, item_id, knowledge, response = batch_data
            user_id: torch.Tensor = user_id.to(device)
            item_id: torch.Tensor = item_id.to(device)
            # Fix: knowledge must be moved to `device` like the other
            # inputs (train() already does this).
            knowledge: torch.Tensor = knowledge.to(device)
            pred: torch.Tensor = self.ncdm_net(user_id, item_id, knowledge)
            y_pred.extend(pred.tolist())
            y_true.extend(response.tolist())
            items.extend(item_id.tolist())

        df = pd.DataFrame({
            "item_id": items,
            "score": y_true,
            "pred": y_pred,
        })

        ground_truth = []
        prediction = []

        for _, group_df in tqdm(df.groupby("item_id"), "formatting item df"):
            ground_truth.append(group_df["score"].values)
            prediction.append(group_df["pred"].values)

        self.ncdm_net.train()

        return ranking_report(
            ground_truth,
            y_pred=prediction,
            coerce="padding"
        )

EduCDM/IRR/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,7 @@
22
# 2021/6/19 @ tongshiwei
33

44
from .IRT import IRT
5+
from .DINA import DINA
6+
from .MIRT import MIRT
7+
from .NCDM import NCDM
58
from .etl import point_etl, pair_etl, extract_item

0 commit comments

Comments
 (0)