11import click
22import logging
33import yaml
4- from typing import Dict , List , Optional
4+ from typing import Dict , List , Optional , Set , Any
55from collections import defaultdict
66import os
77import re
1414from deprecation import deprecated
1515from linkml_runtime import SchemaView
1616from linkml_runtime .linkml_model import SchemaDefinition , ClassDefinition , TypeDefinition , SlotDefinition
17+ from linkml_runtime .linkml_model .meta import UniqueKey
1718from quantulum3 import parser as q_parser
1819from dataclasses import dataclass , field
1920
@@ -63,17 +64,46 @@ class CsvDataGeneralizer(Generalizer):
6364 """
6465
6566 column_separator : str = "\t "
67+ """character that separates columns in the input file"""
68+
6669 schema_name : str = 'example'
70+ """LinkML schema name (no spaces)"""
71+
6772 robot : bool = False
73+ """If true, conforms to robot template format. Data dictionary rows start with '>'"""
74+
75+ data_dictionary_row_count : int = field (default = 0 )
76+ """number of rows after header containing data dictionary information"""
77+
6878 enum_columns : List [str ] = field (default_factory = lambda : [])
79+ """List of columns that are coerced into enums"""
80+
6981 enum_mask_columns : List [str ] = field (default_factory = lambda : [])
82+ """List of columns that are excluded from being enums"""
83+
7084 enum_threshold : float = 0.1
85+ """If the number of distinct values divided by the total number of values is less than this, then the column is considered an enum"""
86+
7187 enum_strlen_threshold : int = 30
88+ """Maximum length of a string to be considered a permissible enum value"""
89+
7290 max_enum_size : int = 50
91+ """Max number of permissible values for a column to be considered an enum"""
92+
7393 downcase_header : bool = False
94+ """If true, coerce column names to be lower case"""
95+
7496 infer_foreign_keys : bool = False
75- max_pk_len : int = 60 # URIs can be long..
97+ """For multi-CSV files, infer linkages between rows"""
98+
99+ max_pk_len : int = 60
100+ """Maximum length to be considered for a primary key column. Note: URIs can be long"""
101+
76102 min_distinct_fk_val : int = 8
103+ """For inferring foreign keys, there must be a minimum number."""
104+
105+ source_schema : Optional [SchemaDefinition ] = None
106+ """Optional base schema to draw from"""
77107
78108 def infer_linkages (self , files : List [str ], ** kwargs ) -> List [ForeignKey ]:
79109 """
@@ -297,14 +327,31 @@ def convert_dicts(self,
297327 rr : List [Dict ],
298328 schema_name : str = 'example' ,
299329 class_name : str = DEFAULT_CLASS_NAME ,
300- ** kwargs ) -> SchemaDefinition :
330+ ** kwargs ) -> Optional [SchemaDefinition ]:
331+ """
332+ Converts a list of row objects to a schema.
333+
334+ Each row is a data item, presumed to be of the same type,
335+ that is generalized.
336+
337+ :param rr:
338+ :param schema_name:
339+ :param class_name:
340+ :param kwargs:
341+ :return:
342+ """
301343 slots = {}
302- slot_values = {}
344+
345+ slot_distinct_values : Dict [str , Set [Any ]] = {}
346+ """distinct values for each slot"""
347+
348+ slot_values : Dict [str , List [Any ]] = defaultdict (list )
349+ """all values for each slot"""
350+
303351 n = 0
304352 enums = {}
305353 robot_defs = {}
306354 slot_usage = {}
307- types = {}
308355 enum_columns = self .enum_columns
309356 enum_mask_columns = self .enum_mask_columns
310357 if len (rr ) == 0 :
@@ -317,6 +364,14 @@ def convert_dicts(self,
317364 for k , v in row .items ():
318365 robot_defs [k ] = v
319366 continue
367+ if n <= self .data_dictionary_row_count :
368+ if self .source_schema is None :
369+ self .source_schema = SchemaDefinition (id = "auto" , name = "auto" )
370+ for k , v in row .items ():
371+ if k not in self .source_schema .slots :
372+ self .source_schema .slots [k ] = SlotDefinition (k )
373+ self .source_schema .slots [k ].description = v
374+ continue
320375 for k , v in row .items ():
321376 if k is None or k == '' :
322377 continue
@@ -332,22 +387,44 @@ def convert_dicts(self,
332387 vs = [v ]
333388 if k not in slots :
334389 slots [k ] = {'range' : None }
335- slot_values [k ] = set ()
390+ slot_distinct_values [k ] = set ()
336391 if v is not None and v != "" and not str (v ).startswith ('$ref:' ):
337392 slots [k ]['examples' ] = [{'value' : v }]
338- slot_values [k ].update (vs )
393+ slot_distinct_values [k ].update (vs )
394+ slot_values [k ] += vs
339395 if len (vs ) > 1 :
340396 slots [k ]['multivalued' ] = True
341397 types = {}
342398 new_slots = {}
399+ col_number = 0
400+ unique_keys = []
343401 for sn , s in slots .items ():
344- vals = slot_values [sn ]
402+ col_number += 1
403+ is_unique = len (set (slot_values [sn ])) == len (slot_values [sn ])
404+ is_pk = is_unique and col_number == 1
405+ if self .source_schema and sn in self .source_schema .slots and self .source_schema .slots [sn ].identifier :
406+ is_pk = True
407+ if is_pk :
408+ s ['identifier' ] = True
409+ elif is_unique :
410+ unique_keys .append (sn )
411+ vals = slot_distinct_values [sn ]
412+ if self .source_schema :
413+ if sn in self .source_schema .slots :
414+ s ['description' ] = self .source_schema .slots [sn ].description
345415 s ['range' ] = infer_range (s , vals , types )
416+ logging .info (f"Slot { sn } has range { s ['range' ]} " )
346417 if (s ['range' ] == 'string' or sn in enum_columns ) and sn not in enum_mask_columns :
418+ filtered_vals = \
419+ [v
420+ for v in slot_values [sn ]
421+ if not isinteger (v ) and not isfloat (v ) and not isboolean (v ) and not is_date (v )]
422+ n_filtered_vals = len (filtered_vals ) + 1
347423 n_distinct = len (vals )
348424 longest = max ([len (str (v )) for v in vals ]) if n_distinct > 0 else 0
425+ logging .info (f"Considering { sn } as enum: { n_distinct } distinct values / { n_filtered_vals } , longest={ longest } " )
349426 if sn in enum_columns or \
350- ((n_distinct / n ) < self .enum_threshold and 0 < n_distinct <= self .max_enum_size
427+ ((n_distinct / n_filtered_vals ) < self .enum_threshold and 0 < n_distinct <= self .max_enum_size
351428 and longest < self .enum_strlen_threshold ):
352429 enum_name = sn .replace (' ' , '_' ).replace ('(s)' , '' )
353430 enum_name = f'{ enum_name } _enum'
@@ -416,6 +493,9 @@ def convert_dicts(self,
416493 for sn , s in new_slots .items ():
417494 if sn not in slots :
418495 slots [sn ] = s
496+
497+ unique_keys = [UniqueKey (f"{ k } _key" ,
498+ unique_key_slots = [k ]) for k in unique_keys ]
419499 schema = SchemaDefinition (
420500 id = f'https://w3id.org/{ schema_name } ' ,
421501 name = schema_name ,
@@ -426,7 +506,9 @@ def convert_dicts(self,
426506 classes = [
427507 ClassDefinition (class_name ,
428508 slots = class_slots ,
429- slot_usage = slot_usage )
509+ slot_usage = slot_usage ,
510+ unique_keys = unique_keys ,
511+ )
430512 ],
431513 slots = slots ,
432514 enums = enums
@@ -465,6 +547,16 @@ def isfloat(value):
465547 except ValueError :
466548 return False
467549
def isinteger(value):
    """
    Tests whether a value can be interpreted as an integer

    Strings are accepted when ``int()`` can parse them (e.g. ``"12"``);
    floats are accepted only when they are whole, finite numbers, because
    ``int()`` would otherwise silently truncate them (``2.5`` -> ``2``).

    :param value: candidate value, typically a cell read from a data file
    :return: True if the value represents an integer, False otherwise
    """
    if isinstance(value, float):
        # is_integer() is False for fractional values, nan and +/-inf
        return value.is_integer()
    try:
        int(value)
        return True
    except (TypeError, ValueError, OverflowError):
        # TypeError: None, lists, dicts; ValueError: non-numeric strings;
        # OverflowError: infinite values from other numeric types
        return False
556+
def isboolean(value):
    """
    Tests whether a value is one of the lower-case boolean literals

    :param value: candidate value
    :return: True only if the value equals 'true' or 'false' (case-sensitive)
    """
    boolean_literals = ('true', 'false')
    return value in boolean_literals
559+
468560
469561def is_measurement (value ):
470562 ms = q_parser .parse (value )
@@ -503,8 +595,18 @@ def is_all_measurement(values):
503595 return False
504596
505597
506- def infer_range (slot : dict , vals : set , types : dict ) -> str :
598+ def infer_range (slot : dict , vals : set , types : dict , coerce = True ) -> str :
599+ """
600+ Infers the range of a slot based on the values
601+
602+ :param slot:
603+ :param vals:
604+ :param types:
605+ :return:
606+ """
607+ logging .info (f"Inferring value for { list (vals )[0 :5 ]} ..." )
507608 nn_vals = [v for v in vals if v is not None and v != "" ]
609+ logging .info (f"FILTERED: { list (nn_vals )[0 :5 ]} ..." )
508610 if len (nn_vals ) == 0 :
509611 return 'string'
510612 if all (str (v ).startswith ('$ref:' ) for v in nn_vals ):
@@ -513,12 +615,15 @@ def infer_range(slot: dict, vals: set, types: dict) -> str:
513615 return 'integer'
514616 if all (isinstance (v , float ) for v in nn_vals ):
515617 return 'float'
516- if all (str (v ).isdigit () for v in nn_vals ):
517- return 'integer'
518- if all (is_date (v ) for v in nn_vals ):
519- return 'datetime'
520- if all (isfloat (v ) for v in nn_vals ):
521- return 'float'
618+ if coerce :
619+ if all (isinteger (v ) for v in nn_vals ):
620+ return 'integer'
621+ if all (isboolean (v ) for v in nn_vals ):
622+ return 'boolean'
623+ if all (isfloat (v ) for v in nn_vals ):
624+ return 'float'
625+ if all (is_date (v ) for v in nn_vals ):
626+ return 'datetime'
522627 if is_all_measurement (nn_vals ):
523628 return 'measurement'
524629 v0 = nn_vals [0 ]
@@ -535,12 +640,17 @@ def infer_range(slot: dict, vals: set, types: dict) -> str:
535640 return 'string'
536641
537642
538- def get_db (db_id : str ) -> str :
539- parts = db_id .split (':' )
540- if len (parts ) > 1 :
541- return parts [0 ]
542- else :
543- return None
def get_db(db_id: str) -> Optional[str]:
    """
    Extracts the database prefix from a CURIE

    The prefix is everything before the first ':' in the identifier.

    :param db_id: candidate identifier; non-string values are tolerated
    :return: the prefix, or None if db_id is not a string or contains no ':'
    """
    if not isinstance(db_id, str):
        return None
    prefix, separator, _ = db_id.partition(':')
    # partition() yields an empty separator when no ':' is present
    return prefix if separator else None
544654
545655
546656def is_date (string , fuzzy = False ):
0 commit comments