Skip to content

Commit 32fbb31

Browse files
committed
Changes to admin reports
1 parent a8c7104 commit 32fbb31

28 files changed

+3017
-3018
lines changed

app/classifier/text/text_classifier.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ def run_on_file(self, input_filename, output_filename, user_id, project_id, labe
116116

117117
def run_model_on_file(input_filename, output_filename, user_id, project_id, label_id=None, method='bow'):
118118
# rf = RandomForestClassifier(verbose=True, class_weight='balanced')
119-
lr = LogisticRegression(verbose=True, class_weight='balanced')
119+
lr = LogisticRegression(verbose=True, class_weight='balanced', random_state=0, penalty='l1', C=10000)
120120
clf = TextClassifier(model=lr)
121121
# pipeline functions are applied sequentially by order of appearance
122122
pipeline = [('base processing', {'col': 'text', 'new_col': 'processed_text'}),

app/server/api.py

Lines changed: 38 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -142,16 +142,6 @@ class UserList(generics.ListCreateAPIView):
142142
class LabelersListAPI(APIView):
143143
pagination_class = None
144144
permission_classes = (IsAuthenticated, IsProjectUser, IsAdminUserAndWriteOnly)
145-
def render_agreement_matrix(self):
146-
return ''
147-
148-
def get_truth_agreement(self):
149-
cursor = connection.cursor()
150-
151-
return 42
152-
153-
def get_labelers_agreement(self):
154-
return 42
155145

156146
def get(self, request, *args, **kwargs):
157147
# def get_annotations
@@ -231,6 +221,7 @@ def plot_agreement_matrix(agreement):
231221

232222

233223
annotations_df = get_annotations(cursor, project_id)
224+
annotations_df.to_csv(r'C:\Users\omri.allouche\Downloads\labeler_agreement.csv')
234225
annotations_df = annotations_df.drop_duplicates(['document_id', 'user_id'])
235226
annotations_df['is_correct'] = [int(x) for x in annotations_df['label_id']==annotations_df['true_label_id']]
236227
user_truth_agreement = annotations_df[ pd.notnull(annotations_df['true_label_id']) ].groupby('user_id')['is_correct'].agg(['count', 'mean'])
@@ -246,7 +237,9 @@ def plot_agreement_matrix(agreement):
246237
users['agreement_with_truth'] = user_truth_agreement['mean']
247238
users = users.reset_index()
248239

240+
num_truth_annotations = annotations_df['true_label_id'].count()
249241
response = {
242+
'num_truth_annotations': num_truth_annotations,
250243
'users': users.fillna(0).T.to_dict(),
251244
'document_agreement': documents_agreement_df.fillna(0).T.to_dict(),
252245
'matrix': plot_agreement_matrix(users_agreement_kappa),
@@ -285,8 +278,9 @@ def get(self, request, *args, **kwargs):
285278
server_documentannotation.label_id
286279
FROM
287280
server_document
288-
LEFT JOIN server_documentannotation ON server_documentannotation.document_id = server_document.id AND server_documentannotation.user_id = %s
289-
WHERE server_document.project_id = %s''' % (str(request.user.id), str(self.kwargs['project_id']))
281+
LEFT JOIN server_documentannotation ON server_documentannotation.document_id = server_document.id
282+
-- AND server_documentannotation.user_id = {user_id}
283+
WHERE server_document.project_id = {project_id}'''.format(user_id=request.user.id, project_id=project_id)
290284

291285
doc_annotations_gold_query = '''SELECT
292286
server_document.id,
@@ -295,17 +289,21 @@ def get(self, request, *args, **kwargs):
295289
FROM
296290
server_document
297291
LEFT JOIN server_documentgoldannotation ON server_documentgoldannotation.document_id = server_document.id
298-
WHERE server_document.project_id =''' + str(self.kwargs['project_id'])
292+
WHERE server_document.project_id = {project_id}'''.format(project_id=project_id)
299293

300294
if not os.path.isdir(ML_FOLDER):
301295
os.makedirs(ML_FOLDER)
302-
with open(os.path.join(ML_FOLDER, INPUT_FILE), 'w', encoding='utf-8', newline='') as outfile:
303-
wr = csv.writer(outfile, quoting=csv.QUOTE_ALL)
304-
wr.writerow(['document_id', 'text', 'label_id'])
305-
cursor.execute(doc_annotations_query)
306-
for row in cursor.fetchall():
307-
label_id = None
308-
wr.writerow([row[0], row[1], row[2]])
296+
297+
cursor.execute(doc_annotations_gold_query)
298+
gold_annotations = cursor.fetchall()
299+
cursor.execute(doc_annotations_query)
300+
user_annotations = cursor.fetchall()
301+
302+
annotations = gold_annotations + user_annotations
303+
304+
df = pd.DataFrame(annotations, columns=['document_id', 'text', 'label_id'])
305+
df = df.drop_duplicates(['document_id'])
306+
df.to_csv(os.path.join(ML_FOLDER, INPUT_FILE), encoding='utf-8')
309307

310308
result = run_model_on_file(os.path.join(ML_FOLDER, INPUT_FILE), os.path.join(ML_FOLDER, OUTPUT_FILE), user_id=0, project_id=project_id)
311309

@@ -349,30 +347,40 @@ def get(self, request, *args, **kwargs):
349347
class DocumentExplainAPI(generics.RetrieveUpdateDestroyAPIView):
350348
project_id = 999 # TODO: Change this to the actual current project
351349
pagination_class = None
352-
permission_classes = (IsAuthenticated, IsProjectUser, IsAdminUser)
350+
permission_classes = (IsAuthenticated, IsProjectUser)
353351
class_weights = None
354352
filename = 'ml_models/ml_logistic_regression_weights_{project_id}.csv'.format(project_id=project_id)
355353
has_weights = False
356-
if (os.path.isfile(filename)):
357-
class_weights = pd.read_csv(os.path.abspath(filename), header=None,
358-
names=['term', 'weight']).set_index('term')['weight']
359-
has_weights = True
360-
354+
355+
def get_class_weights(self):
356+
if not self.has_weights:
357+
self.set_class_weights()
358+
return self.class_weights
359+
360+
def set_class_weights(self):
361+
if (os.path.isfile(self.filename)):
362+
data = pd.read_csv(os.path.abspath(self.filename), header=None, names=['term', 'weight'])
363+
data['term'] = data['term'].str.replace('processed_text_w_', '')
364+
self.class_weights = data.set_index('term')['weight']
365+
self.has_weights = True
366+
361367
def get(self, request, *args, **kwargs):
362368
d = get_object_or_404(Document, pk=self.kwargs['doc_id'])
363369
doc_text_splited = d.text.split(' ')
364370
format_str_positive = '<span class="has-background-success">{}</span>'
365371
format_str_negative = '<span class="has-background-danger">{}</span>'
366372
text = []
367-
if self.has_weights:
373+
class_weights = self.get_class_weights()
374+
if class_weights is not None:
368375
for w in doc_text_splited:
369-
weight = self.class_weights.get(w.lower().replace(',','').replace('.',''), 0)
376+
weight = class_weights.get(w.lower().replace(',','').replace('.',''), 0)
370377
if weight < -0.2:
371378
text.append(format_str_negative.format(w))
372379
elif weight > 0.2:
373380
text.append(format_str_positive.format(w))
374381
else:
375382
text.append(w)
383+
376384
response = {'document': ' '.join(text)}
377385
# doc_text_splited = [w if np.abs(self.class_weights.get(w,0))<0.2 else format_str.format(w) for w in doc_text_splited]
378386
# doc_text_splited[0] = '<span class="has-background-primary">' + doc_text_splited[0] + '</span>'
@@ -465,7 +473,8 @@ def get_queryset(self):
465473

466474
if self.request.query_params.get('is_checked'):
467475
is_null = self.request.query_params.get('is_checked') == 'true'
468-
queryset = project.get_documents(is_null).distinct()
476+
print(int(is_null))
477+
queryset = project.get_documents(is_null=is_null, user=self.request.user.id).distinct()
469478

470479
if (project.use_machine_model_sort):
471480
queryset = queryset.order_by('doc_mlm_annotations__prob').filter(project=self.kwargs['project_id']).exclude(doc_mlm_annotations__prob__isnull=True)

app/server/labelers_comparison_functions.py

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,13 @@ def create_kappa_comparison_df(labelers_df, filter_double_score=False):
4646
comparison_df = pd.DataFrame(index=col_list,columns=col_list)
4747
for name1 in col_list:
4848
for name2 in col_list:
49-
set1 = labelers_df[name1].astype('str')
50-
set2 = labelers_df[name2].astype('str')
49+
if name1==name2:
50+
comparison_df.loc[name1, name2] = 1
51+
continue
52+
53+
temp_df = labelers_df[[name1, name2]].dropna(axis='index')
54+
set1 = temp_df.loc[:,name1]
55+
set2 = temp_df.loc[:,name2]
5156
score = cohen_kappa_score(set1, set2)
5257
comparison_df.loc[name1, name2] = score
5358
comparison_df.loc[name2, name1] = score
@@ -85,8 +90,10 @@ def calc_agreement(labelers_df, y):
8590
'''
8691
labeler_cols = [c for c in labelers_df.columns if c!=y]
8792
def calc_agreement_row(x):
88-
values = x.loc[labeler_cols]
8993
true_y = x.loc[y]
94+
if pd.isnull(true_y):
95+
return None
96+
values = [v for v in x.loc[labeler_cols] if pd.notnull(v)]
9097
return (values==true_y).mean()
9198
return labelers_df.apply(calc_agreement_row, axis=1)
9299

@@ -96,7 +103,7 @@ def calc_entropy(labelers_df):
96103
:param labelers_df: a df in which each column is a labeler and each row a sample
97104
:return: a pd.Series of the entropy score of each samples
98105
'''
99-
classes = np.unique(labelers_df)
106+
classes = [v for v in np.unique(labelers_df) if pd.notnull(v)]
100107
return labelers_df.apply(lambda x: sp.stats.entropy([list(x).count(c) for c in classes]), axis=1)
101108

102109

@@ -107,9 +114,9 @@ def add_agreement_columns(labelers_df,y=None):
107114
:return: the labelers_df with 3 or 4 new columns
108115
'''
109116
df_copy = labelers_df.copy()
110-
cols = df_copy.columns
111117
if y != None:
112-
df_copy['true_agreement_prop'] = calc_agreement(df_copy[cols], y)
118+
df_copy['true_agreement_prop'] = calc_agreement(df_copy, y)
119+
cols = [c for c in df_copy.columns if isinstance(c, int)]
113120
df_copy['most_common'] = find_most_common_labeling(df_copy[cols])
114121
df_copy['most_common_agreement_prop'] = calc_agreement(df_copy[list(cols) + ['most_common']], 'most_common')
115122
df_copy['entropy'] = calc_entropy(df_copy[cols])
@@ -200,6 +207,16 @@ def train_labelers_based_model(labelers_df, y):
200207

201208

202209
if __name__ == '__main__':
203-
df = pd.read_csv(r'C:\Users\omri.allouche\Downloads\labeler_agreement.csv')
204-
pivot_table = df.pivot(index='document_id', columns='user_id', values='label_id')
205-
create_kappa_comparison_df(pivot_table)
210+
annotations_df = pd.read_csv(r'C:\Users\omri.allouche\Downloads\labeler_agreement.csv')
211+
annotations_df = annotations_df.drop_duplicates(['document_id', 'user_id'])
212+
annotations_df['is_correct'] = [int(x) for x in annotations_df['label_id'] == annotations_df['true_label_id']]
213+
user_truth_agreement = annotations_df[pd.notnull(annotations_df['true_label_id'])].groupby('user_id')[
214+
'is_correct'].agg(['count', 'mean'])
215+
216+
document_annotations_by_labeler = annotations_df.pivot(index='document_id', columns='user_id', values='label_id')
217+
document_annotations_by_labeler = pd.merge(left=document_annotations_by_labeler,
218+
right=annotations_df.set_index('document_id')[['true_label_id']],
219+
left_index=True, right_index=True)
220+
documents_agreement_df = add_agreement_columns(document_annotations_by_labeler, 'true_label_id')
221+
users_agreement_kappa = create_kappa_comparison_df(document_annotations_by_labeler)
222+
average_kappa_agreement_per_labeler = compute_average_agreement_per_labeler(users_agreement_kappa)

app/server/models.py

Lines changed: 5 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -45,25 +45,6 @@ class Project(models.Model):
4545
'document_serializer': '',
4646
'annotations_serializer': '',
4747
},
48-
49-
# 'DocumentClassification': {
50-
# 'title': '',
51-
# 'type': '',
52-
# 'image': '',
53-
# 'template_html': '',
54-
# 'document_serializer': '',
55-
# 'annotations_serializer': '',
56-
# },
57-
#
58-
# 'DocumentClassification': {
59-
# 'title': '',
60-
# 'type': '',
61-
# 'image': '',
62-
# 'template_html': '',
63-
# 'document_serializer': '',
64-
# 'annotations_serializer': '',
65-
# },
66-
6748
}
6849
DOCUMENT_CLASSIFICATION = 'DocumentClassification'
6950
SEQUENCE_LABELING = 'SequenceLabeling'
@@ -92,7 +73,8 @@ def get_absolute_url(self):
9273
return reverse('upload', args=[self.id])
9374

9475
def is_type_of(self, project_type):
95-
return project_type == self.project_type
76+
# return project_type == self.project_type
77+
return self.project_types[ self.project_type ]['type'] == project_type
9678

9779
def get_progress(self, user):
9880
docs = self.get_documents(is_null=True, user=user)
@@ -103,26 +85,10 @@ def get_progress(self, user):
10385
@property
10486
def image(self):
10587
url = self.project_types[ self.project_type ]['image']
106-
# if self.is_type_of(self.DOCUMENT_CLASSIFICATION):
107-
# url = staticfiles_storage.url('images/cat-1045782_640.jpg')
108-
# elif self.is_type_of(self.SEQUENCE_LABELING):
109-
# url = staticfiles_storage.url('images/cat-3449999_640.jpg')
110-
# elif self.is_type_of(self.Seq2seq):
111-
# url = staticfiles_storage.url('images/tiger-768574_640.jpg')
112-
11388
return url
11489

11590
def get_template_name(self):
11691
template_name = self.project_types[ self.project_type ]['template_html']
117-
# if self.is_type_of(Project.DOCUMENT_CLASSIFICATION):
118-
# template_name = 'annotation/document_classification.html'
119-
# elif self.is_type_of(Project.SEQUENCE_LABELING):
120-
# template_name = 'annotation/sequence_labeling.html'
121-
# elif self.is_type_of(Project.Seq2seq):
122-
# template_name = 'annotation/seq2seq.html'
123-
# else:
124-
# raise ValueError('Template does not exist')
125-
12692
return template_name
12793

12894
def get_mlm_user(self):
@@ -156,8 +122,8 @@ def get_documents(self, is_null=True, user=None):
156122
if self.is_type_of(Project.DOCUMENT_CLASSIFICATION):
157123
if user:
158124
docs = docs.exclude(doc_annotations__user=user)
159-
else:
160-
docs = docs.filter(doc_annotations__isnull=is_null)
125+
# else:
126+
# docs = docs.filter(doc_annotations__isnull=is_null)
161127
elif self.is_type_of(Project.SEQUENCE_LABELING):
162128
if user:
163129
docs = docs.exclude(seq_annotations__user=user)
@@ -169,6 +135,7 @@ def get_documents(self, is_null=True, user=None):
169135
else:
170136
docs = docs.filter(seq2seq_annotations__isnull=is_null)
171137
else:
138+
print('Project type: '+self.project_type)
172139
raise ValueError('Invalid project_type')
173140

174141
return docs

0 commit comments

Comments
 (0)