strelok2012
diff --git a/app/classifier/text/text_classifier.py b/app/classifier/text/text_classifier.py
Lines changed: 1 addition & 1 deletion
diff --git a/app/server/api.py b/app/server/api.py
Lines changed: 38 additions & 29 deletions
diff --git a/app/server/labelers_comparison_functions.py b/app/server/labelers_comparison_functions.py
Lines changed: 26 additions & 9 deletions
diff --git a/app/server/models.py b/app/server/models.py
Lines changed: 5 additions & 38 deletions
@@ -116,7 +116,7 @@ def run_on_file(self, input_filename, output_filename, user_id, project_id, labe
 
 def run_model_on_file(input_filename, output_filename, user_id, project_id, label_id=None, method='bow'):
     # rf = RandomForestClassifier(verbose=True, class_weight='balanced')
-    lr = LogisticRegression(verbose=True, class_weight='balanced')
+    lr = LogisticRegression(verbose=True, class_weight='balanced', random_state=0, penalty='l1', C=10000)
     clf = TextClassifier(model=lr)
     # pipeline functions are applied sequentially by order of appearance
     pipeline = [('base processing', {'col': 'text', 'new_col': 'processed_text'}),
 
@@ -142,16 +142,6 @@ class UserList(generics.ListCreateAPIView):
 class LabelersListAPI(APIView):
     pagination_class = None
     permission_classes = (IsAuthenticated, IsProjectUser, IsAdminUserAndWriteOnly)
-    def render_agreement_matrix(self):
-        return 'data:image/gif;base64,R0lGODlhPQBEAPeoAJosM//AwO/AwHVYZ/z595kzAP/s7P+goOXMv8+fhw/v739/f+8PD98fH/8mJl+fn/9ZWb8/PzWlwv///6wWGbImAPgTEMImIN9gUFCEm/gDALULDN8PAD6atYdCTX9gUNKlj8wZAKUsAOzZz+UMAOsJAP/Z2ccMDA8PD/95eX5NWvsJCOVNQPtfX/8zM8+QePLl38MGBr8JCP+zs9myn/8GBqwpAP/GxgwJCPny78lzYLgjAJ8vAP9fX/+MjMUcAN8zM/9wcM8ZGcATEL+QePdZWf/29uc/P9cmJu9MTDImIN+/r7+/vz8/P8VNQGNugV8AAF9fX8swMNgTAFlDOICAgPNSUnNWSMQ5MBAQEJE3QPIGAM9AQMqGcG9vb6MhJsEdGM8vLx8fH98AANIWAMuQeL8fABkTEPPQ0OM5OSYdGFl5jo+Pj/+pqcsTE78wMFNGQLYmID4dGPvd3UBAQJmTkP+8vH9QUK+vr8ZWSHpzcJMmILdwcLOGcHRQUHxwcK9PT9DQ0O/v70w5MLypoG8wKOuwsP/g4P/Q0IcwKEswKMl8aJ9fX2xjdOtGRs/Pz+Dg4GImIP8gIH0sKEAwKKmTiKZ8aB/f39Wsl+LFt8dgUE9PT5x5aHBwcP+AgP+WltdgYMyZfyywz78AAAAAAAD///8AAP9mZv///wAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACH5BAEAAKgALAAAAAA9AEQAAAj/AFEJHEiwoMGDCBMqXMiwocAbBww4nEhxoYkUpzJGrMixogkfGUNqlNixJEIDB0SqHGmyJSojM1bKZOmyop0gM3Oe2liTISKMOoPy7GnwY9CjIYcSRYm0aVKSLmE6nfq05QycVLPuhDrxBlCtYJUqNAq2bNWEBj6ZXRuyxZyDRtqwnXvkhACDV+euTeJm1Ki7A73qNWtFiF+/gA95Gly2CJLDhwEHMOUAAuOpLYDEgBxZ4GRTlC1fDnpkM+fOqD6DDj1aZpITp0dtGCDhr+fVuCu3zlg49ijaokTZTo27uG7Gjn2P+hI8+PDPERoUB318bWbfAJ5sUNFcuGRTYUqV/3ogfXp1rWlMc6awJjiAAd2fm4ogXjz56aypOoIde4OE5u/F9x199dlXnnGiHZWEYbGpsAEA3QXYnHwEFliKAgswgJ8LPeiUXGwedCAKABACCN+EA1pYIIYaFlcDhytd51sGAJbo3onOpajiihlO92KHGaUXGwWjUBChjSPiWJuOO/LYIm4v1tXfE6J4gCSJEZ7YgRYUNrkji9P55sF/ogxw5ZkSqIDaZBV6aSGYq/lGZplndkckZ98xoICbTcIJGQAZcNmdmUc210hs35nCyJ58fgmIKX5RQGOZowxaZwYA+JaoKQwswGijBV4C6SiTUmpphMspJx9unX4KaimjDv9aaXOEBteBqmuuxgEHoLX6Kqx+yXqqBANsgCtit4FWQAEkrNbpq7HSOmtwag5w57GrmlJBASEU18ADjUYb3ADTinIttsgSB1oJFfA63bduimuqKB1keqwUhoCSK374wbujvOSu4QG6UvxBRydcpKsav++Ca6G8A6Pr1x2kVMyHwsVxUALDq/krnrhPSOzXG1lUTIoffqGR7Goi2MAxb
v6O2kEG56I7CSlRsEFKFVyovDJoIRTg7sugNRDGqCJzJgcKE0ywc0ELm6KBCCJo8DIPFeCWNGcyqNFE06ToAfV0HBRgxsvLThHn1oddQMrXj5DyAQgjEHSAJMWZwS3HPxT/QMbabI/iBCliMLEJKX2EEkomBAUCxRi42VDADxyTYDVogV+wSChqmKxEKCDAYFDFj4OmwbY7bDGdBhtrnTQYOigeChUmc1K3QTnAUfEgGFgAWt88hKA6aCRIXhxnQ1yg3BCayK44EWdkUQcBByEQChFXfCB776aQsG0BIlQgQgE8qO26X1h8cEUep8ngRBnOy74E9QgRgEAC8SvOfQkh7FDBDmS43PmGoIiKUUEGkMEC/PJHgxw0xH74yx/3XnaYRJgMB8obxQW6kL9QYEJ0FIFgByfIL7/IQAlvQwEpnAC7DtLNJCKUoO/w45c44GwCXiAFB/OXAATQryUxdN4LfFiwgjCNYg+kYMIEFkCKDs6PKAIJouyGWMS1FSKJOMRB/BoIxYJIUXFUxNwoIkEKPAgCBZSQHQ1A2EWDfDEUVLyADj5AChSIQW6gu10bE/JG2VnCZGfo4R4d0sdQoBAHhPjhIB94v/wRoRKQWGRHgrhGSQJxCS+0pCZbEhAAOw=='
-    
-    def get_truth_agreement(self):
-        cursor = connection.cursor()
-
-        return 42
-
-    def get_labelers_agreement(self):
-        return 42
 
     def get(self, request, *args, **kwargs):
         # def get_annotations
@@ -231,6 +221,7 @@ def plot_agreement_matrix(agreement):
 
 
         annotations_df = get_annotations(cursor, project_id)
+        annotations_df.to_csv(r'C:\Users\omri.allouche\Downloads\labeler_agreement.csv')
         annotations_df = annotations_df.drop_duplicates(['document_id', 'user_id'])
         annotations_df['is_correct'] = [int(x) for x in annotations_df['label_id']==annotations_df['true_label_id']]
         user_truth_agreement = annotations_df[ pd.notnull(annotations_df['true_label_id']) ].groupby('user_id')['is_correct'].agg(['count', 'mean'])
@@ -246,7 +237,9 @@ def plot_agreement_matrix(agreement):
         users['agreement_with_truth'] = user_truth_agreement['mean']
         users = users.reset_index()
 
+        num_truth_annotations = annotations_df['true_label_id'].count()
         response = {
+            'num_truth_annotations': num_truth_annotations,
             'users': users.fillna(0).T.to_dict(),
             'document_agreement': documents_agreement_df.fillna(0).T.to_dict(),
             'matrix': plot_agreement_matrix(users_agreement_kappa),
@@ -285,8 +278,9 @@ def get(self, request, *args, **kwargs):
             server_documentannotation.label_id
             FROM
             server_document
-            LEFT JOIN server_documentannotation ON server_documentannotation.document_id = server_document.id AND server_documentannotation.user_id = %s
-            WHERE server_document.project_id = %s''' % (str(request.user.id), str(self.kwargs['project_id']))
+            LEFT JOIN server_documentannotation ON server_documentannotation.document_id = server_document.id 
+              -- AND server_documentannotation.user_id = {user_id}
+            WHERE server_document.project_id = {project_id}'''.format(user_id=request.user.id, project_id=project_id)
 
         doc_annotations_gold_query = '''SELECT
             server_document.id,
@@ -295,17 +289,21 @@ def get(self, request, *args, **kwargs):
             FROM
             server_document
             LEFT JOIN server_documentgoldannotation ON server_documentgoldannotation.document_id = server_document.id
-            WHERE server_document.project_id =''' + str(self.kwargs['project_id'])
+            WHERE server_document.project_id = {project_id}'''.format(project_id=project_id)
 
         if not os.path.isdir(ML_FOLDER):
             os.makedirs(ML_FOLDER)
-        with open(os.path.join(ML_FOLDER, INPUT_FILE), 'w', encoding='utf-8', newline='') as outfile:
-            wr = csv.writer(outfile, quoting=csv.QUOTE_ALL)
-            wr.writerow(['document_id', 'text', 'label_id'])
-            cursor.execute(doc_annotations_query)
-            for row in cursor.fetchall():
-                label_id = None
-                wr.writerow([row[0], row[1], row[2]])
+
+        cursor.execute(doc_annotations_gold_query)
+        gold_annotations = cursor.fetchall()
+        cursor.execute(doc_annotations_query)
+        user_annotations = cursor.fetchall()
+
+        annotations = gold_annotations + user_annotations
+
+        df = pd.DataFrame(annotations, columns=['document_id', 'text', 'label_id'])
+        df = df.drop_duplicates(['document_id'])
+        df.to_csv(os.path.join(ML_FOLDER, INPUT_FILE), encoding='utf-8')
 
         result = run_model_on_file(os.path.join(ML_FOLDER, INPUT_FILE), os.path.join(ML_FOLDER, OUTPUT_FILE), user_id=0, project_id=project_id)
 
@@ -349,30 +347,40 @@ def get(self, request, *args, **kwargs):
 class DocumentExplainAPI(generics.RetrieveUpdateDestroyAPIView):
     project_id = 999 # TODO: Change this to the actual current project
     pagination_class = None
-    permission_classes = (IsAuthenticated, IsProjectUser, IsAdminUser)
+    permission_classes = (IsAuthenticated, IsProjectUser)
     class_weights = None
     filename = 'ml_models/ml_logistic_regression_weights_{project_id}.csv'.format(project_id=project_id)
     has_weights = False
-    if (os.path.isfile(filename)):
-        class_weights = pd.read_csv(os.path.abspath(filename), header=None,
-                    names=['term', 'weight']).set_index('term')['weight']
-        has_weights = True
-        
+
+    def get_class_weights(self):
+        if not self.has_weights:
+            self.set_class_weights()
+        return self.class_weights
+
+    def set_class_weights(self):
+        if (os.path.isfile(self.filename)):
+            data = pd.read_csv(os.path.abspath(self.filename), header=None, names=['term', 'weight'])
+            data['term'] = data['term'].str.replace('processed_text_w_', '')
+            self.class_weights = data.set_index('term')['weight']
+            self.has_weights = True
+
     def get(self, request, *args, **kwargs):
         d = get_object_or_404(Document, pk=self.kwargs['doc_id'])
         doc_text_splited = d.text.split(' ')
         format_str_positive = '<span class="has-background-success">{}</span>'
         format_str_negative = '<span class="has-background-danger">{}</span>'
         text = []
-        if self.has_weights:
+        class_weights = self.get_class_weights()
+        if class_weights is not None:
             for w in doc_text_splited:
-                weight = self.class_weights.get(w.lower().replace(',','').replace('.',''), 0)
+                weight = class_weights.get(w.lower().replace(',','').replace('.',''), 0)
                 if weight < -0.2:
                     text.append(format_str_negative.format(w))
                 elif weight > 0.2:
                     text.append(format_str_positive.format(w))
                 else:
                     text.append(w)
+
         response = {'document': ' '.join(text)}
         # doc_text_splited = [w if np.abs(self.class_weights.get(w,0))<0.2 else format_str.format(w) for w in doc_text_splited]
         # doc_text_splited[0] = '<span class="has-background-primary">' + doc_text_splited[0] + '</span>'
@@ -465,7 +473,8 @@ def get_queryset(self):
 
         if self.request.query_params.get('is_checked'):
             is_null = self.request.query_params.get('is_checked') == 'true'
-            queryset = project.get_documents(is_null).distinct()
+            print(int(is_null))
+            queryset = project.get_documents(is_null=is_null, user=self.request.user.id).distinct()
 
         if (project.use_machine_model_sort):
             queryset = queryset.order_by('doc_mlm_annotations__prob').filter(project=self.kwargs['project_id']).exclude(doc_mlm_annotations__prob__isnull=True)
 
@@ -46,8 +46,13 @@ def create_kappa_comparison_df(labelers_df, filter_double_score=False):
     comparison_df = pd.DataFrame(index=col_list,columns=col_list)
     for name1 in col_list:
         for name2 in col_list:
-            set1 = labelers_df[name1].astype('str')
-            set2 = labelers_df[name2].astype('str')
+            if name1==name2:
+                comparison_df.loc[name1, name2] = 1
+                continue
+
+            temp_df = labelers_df[[name1, name2]].dropna(axis='index')
+            set1 = temp_df.loc[:,name1]
+            set2 = temp_df.loc[:,name2]
             score = cohen_kappa_score(set1, set2)
             comparison_df.loc[name1, name2] = score
             comparison_df.loc[name2, name1] = score
@@ -85,8 +90,10 @@ def calc_agreement(labelers_df, y):
     '''
     labeler_cols = [c for c in labelers_df.columns if c!=y]
     def calc_agreement_row(x):
-        values = x.loc[labeler_cols]
         true_y = x.loc[y]
+        if pd.isnull(true_y):
+            return None
+        values = [v for v in x.loc[labeler_cols] if pd.notnull(v)]
         return (values==true_y).mean()
     return labelers_df.apply(calc_agreement_row, axis=1)
 
@@ -96,7 +103,7 @@ def calc_entropy(labelers_df):
     :param labelers_df: a df in which each column is a labeler and each row a sample
     :return: a pd.Series of the entropy score of each samples
     '''
-    classes = np.unique(labelers_df)
+    classes = [v for v in np.unique(labelers_df) if pd.notnull(v)]
     return labelers_df.apply(lambda x: sp.stats.entropy([list(x).count(c) for c in classes]), axis=1)
 
 
@@ -107,9 +114,9 @@ def add_agreement_columns(labelers_df,y=None):
     :return: the labelers_df with 3 or 4 new columns
     '''
     df_copy = labelers_df.copy()
-    cols = df_copy.columns
     if y != None:
-        df_copy['true_agreement_prop'] = calc_agreement(df_copy[cols], y)
+        df_copy['true_agreement_prop'] = calc_agreement(df_copy, y)
+    cols = [c for c in df_copy.columns if isinstance(c, int)]
     df_copy['most_common'] = find_most_common_labeling(df_copy[cols])
     df_copy['most_common_agreement_prop'] = calc_agreement(df_copy[list(cols) + ['most_common']], 'most_common')
     df_copy['entropy'] = calc_entropy(df_copy[cols])
@@ -200,6 +207,16 @@ def train_labelers_based_model(labelers_df, y):
 
 
 if __name__ == '__main__':
-    df = pd.read_csv(r'C:\Users\omri.allouche\Downloads\labeler_agreement.csv')
-    pivot_table = df.pivot(index='document_id', columns='user_id', values='label_id')
-    create_kappa_comparison_df(pivot_table)
+    annotations_df = pd.read_csv(r'C:\Users\omri.allouche\Downloads\labeler_agreement.csv')
+    annotations_df = annotations_df.drop_duplicates(['document_id', 'user_id'])
+    annotations_df['is_correct'] = [int(x) for x in annotations_df['label_id'] == annotations_df['true_label_id']]
+    user_truth_agreement = annotations_df[pd.notnull(annotations_df['true_label_id'])].groupby('user_id')[
+        'is_correct'].agg(['count', 'mean'])
+
+    document_annotations_by_labeler = annotations_df.pivot(index='document_id', columns='user_id', values='label_id')
+    document_annotations_by_labeler = pd.merge(left=document_annotations_by_labeler,
+                                               right=annotations_df.set_index('document_id')[['true_label_id']],
+                                               left_index=True, right_index=True)
+    documents_agreement_df = add_agreement_columns(document_annotations_by_labeler, 'true_label_id')
+    users_agreement_kappa = create_kappa_comparison_df(document_annotations_by_labeler)
+    average_kappa_agreement_per_labeler = compute_average_agreement_per_labeler(users_agreement_kappa)
@@ -45,25 +45,6 @@ class Project(models.Model):
             'document_serializer': '',
             'annotations_serializer': '',
         },
-
-        # 'DocumentClassification': {
-        #     'title': '',
-        #     'type': '',
-        #     'image': '',
-        #     'template_html': '',
-        #     'document_serializer': '',
-        #     'annotations_serializer': '',
-        # },
-        #
-        # 'DocumentClassification': {
-        #     'title': '',
-        #     'type': '',
-        #     'image': '',
-        #     'template_html': '',
-        #     'document_serializer': '',
-        #     'annotations_serializer': '',
-        # },
-
     }
     DOCUMENT_CLASSIFICATION = 'DocumentClassification'
     SEQUENCE_LABELING = 'SequenceLabeling'
@@ -92,7 +73,8 @@ def get_absolute_url(self):
         return reverse('upload', args=[self.id])
 
     def is_type_of(self, project_type):
-        return project_type == self.project_type
+        # return project_type == self.project_type
+        return self.project_types[ self.project_type ]['type'] == project_type
 
     def get_progress(self, user):
         docs = self.get_documents(is_null=True, user=user)
@@ -103,26 +85,10 @@ def get_progress(self, user):
     @property
     def image(self):
         url = self.project_types[ self.project_type ]['image']
-        # if self.is_type_of(self.DOCUMENT_CLASSIFICATION):
-        #     url = staticfiles_storage.url('images/cat-1045782_640.jpg')
-        # elif self.is_type_of(self.SEQUENCE_LABELING):
-        #     url = staticfiles_storage.url('images/cat-3449999_640.jpg')
-        # elif self.is_type_of(self.Seq2seq):
-        #     url = staticfiles_storage.url('images/tiger-768574_640.jpg')
-
         return url
 
     def get_template_name(self):
         template_name = self.project_types[ self.project_type ]['template_html']
-        # if self.is_type_of(Project.DOCUMENT_CLASSIFICATION):
-        #     template_name = 'annotation/document_classification.html'
-        # elif self.is_type_of(Project.SEQUENCE_LABELING):
-        #     template_name = 'annotation/sequence_labeling.html'
-        # elif self.is_type_of(Project.Seq2seq):
-        #     template_name = 'annotation/seq2seq.html'
-        # else:
-        #     raise ValueError('Template does not exist')
-
         return template_name
 
     def get_mlm_user(self):
@@ -156,8 +122,8 @@ def get_documents(self, is_null=True, user=None):
         if self.is_type_of(Project.DOCUMENT_CLASSIFICATION):
             if user:
                 docs = docs.exclude(doc_annotations__user=user)
-            else:
-                docs = docs.filter(doc_annotations__isnull=is_null)
+            # else:
+                # docs = docs.filter(doc_annotations__isnull=is_null)
         elif self.is_type_of(Project.SEQUENCE_LABELING):
             if user:
                 docs = docs.exclude(seq_annotations__user=user)
@@ -169,6 +135,7 @@ def get_documents(self, is_null=True, user=None):
             else:
                 docs = docs.filter(seq2seq_annotations__isnull=is_null)
         else:
+            print('Project type: '+self.project_type)
             raise ValueError('Invalid project_type')
 
         return docs