1313from linkml_runtime .linkml_model import SchemaDefinition
1414from oaklib .selector import get_resource_from_shorthand , get_implementation_from_shorthand
1515
16- from schema_automator import JsonLdAnnotator
16+ from schema_automator import JsonLdAnnotator , FrictionlessImportEngine
1717from schema_automator .annotators .schema_annotator import SchemaAnnotator
1818from schema_automator .generalizers .csv_data_generalizer import CsvDataGeneralizer
1919from schema_automator .generalizers .generalizer import DEFAULT_CLASS_NAME , DEFAULT_SCHEMA_NAME
4444 default = DEFAULT_SCHEMA_NAME ,
4545 show_default = True ,
4646 help = 'Schema name' )
# Reusable --schema-id click option, shared by importer subcommands
# (same one-line style as the other shared option constants below).
schema_id_option = click.option('--schema-id', help='Schema id')
4750annotator_option = click .option (
4851 '--annotator' ,
4952 '-A' ,
5255 "--use-attributes/--no-use-attributes" ,
5356 help = "If true, use attributes over slots/slot_usage"
5457)
# Shared click option objects: declared once here, applied as decorators
# by several subcommands so flag names/defaults stay consistent.
column_separator_option = click.option(
    '--column-separator', '-s',
    default='\t',
    help='separator',
)

# generalizer options

downcase_header_option = click.option(
    '--downcase-header/--no-downcase-header',
    default=False,
    help='if true make headers lowercase',
)
snakecase_header_option = click.option(
    '--snakecase-header/--no-snakecase-header',
    default=False,
    help='if true make headers snakecase',
)
infer_foreign_keys_option = click.option(
    '--infer-foreign-keys/--no-infer-foreign-keys',
    default=False,
    help='infer ranges/foreign keys',
)
enum_columns_option = click.option(
    '--enum-columns', '-E',
    multiple=True,
    help='column(s) that is forced to be an enum',
)
enum_mask_columns_option = click.option(
    '--enum-mask-columns',
    multiple=True,
    help='column(s) that are excluded from being enums',
)
max_enum_size_option = click.option(
    '--max-enum-size',
    default=50,
    help='do not create an enum if more than max distinct members',
)
enum_threshold_option = click.option(
    '--enum-threshold',
    default=0.1,
    help='if the number of distinct values / rows is less than this, do not make an enum',
)
5570
5671@click .group ()
5772@click .option ("-v" , "--verbose" ,
@@ -89,13 +104,12 @@ def main(verbose: int, quiet: bool):
89104@schema_name_option
90105@annotator_option
91106@click .option ('--class-name' , '-c' , default = DEFAULT_CLASS_NAME , help = 'Core class name in schema' )
92- @click .option ('--column-separator' , '-s' , default = '\t ' , help = 'separator' )
93- @click .option ('--downcase-header/--no-downcase-header' , default = False , help = 'if true make headers lowercase' )
94- @click .option ('--enum-columns' , '-E' , multiple = True , help = 'column that is forced to be an enum' )
95- @click .option ('--enum-threshold' , type = click .FLOAT , help = 'set high to be more inclusive' )
96- @click .option ('--max-enum-size' ,
97- type = click .INT ,
98- help = 'set high to be more inclusive' )
107+ @column_separator_option
108+ @downcase_header_option
109+ @snakecase_header_option
110+ @enum_columns_option
111+ @enum_threshold_option
112+ @max_enum_size_option
99113@click .option ('--data-dictionary-row-count' ,
100114 type = click .INT ,
101115 help = 'rows that provide metadata about columns' )
@@ -128,13 +142,12 @@ def generalize_tsv(tsvfile, output, class_name, schema_name, pandera: bool, anno
128142@click .argument ('tsvfiles' , nargs = - 1 ) # input TSV (must have column headers
129143@output_option
130144@schema_name_option
131- @click .option ('--column-separator' , '-s' , default = '\t ' , help = 'separator' )
132- @click .option ('--downcase-header/--no-downcase-header' , default = False , help = 'if true make headers lowercase' )
133- @click .option ('--infer-foreign-keys/--no-infer-foreign-keys' , default = False , help = 'infer ranges/foreign keys' )
134- @click .option ('--enum-columns' , '-E' , multiple = True , help = 'column(s) that is forced to be an enum' )
135- @click .option ('--enum-mask-columns' , multiple = True , help = 'column(s) that are excluded from being enums' )
136- @click .option ('--max-enum-size' , default = 50 , help = 'do not create an enum if more than max distinct members' )
137- @click .option ('--enum-threshold' , default = 0.1 , help = 'if the number of distinct values / rows is less than this, do not make an enum' )
145+ @column_separator_option
146+ @downcase_header_option
147+ @snakecase_header_option
148+ @enum_columns_option
149+ @enum_threshold_option
150+ @max_enum_size_option
138151@click .option ('--robot/--no-robot' , default = False , help = 'set if the TSV is a ROBOT template' )
139152def generalize_tsvs (tsvfiles , output , schema_name , ** kwargs ):
140153 """
@@ -157,6 +170,12 @@ def generalize_tsvs(tsvfiles, output, schema_name, **kwargs):
157170@click .argument ('url' ) # input TSV (must have column headers
158171@output_option
159172@schema_name_option
173+ @column_separator_option
174+ @downcase_header_option
175+ @snakecase_header_option
176+ @enum_columns_option
177+ @enum_threshold_option
178+ @max_enum_size_option
160179@click .option ('--class-name' , '-c' , default = DEFAULT_CLASS_NAME , help = 'Core class name in schema' )
161180@click .option ('--pandera/--no-pandera' , default = False , help = 'set to use panderas as inference engine' )
162181@click .option ('--data-output' , help = 'Path to file of downloaded data' )
@@ -179,8 +198,13 @@ def generalize_htmltable(url, output, class_name, schema_name, pandera: bool,
179198 dfs = pd .read_html (url )
180199 logging .info (f"{ url } has { len (dfs )} tables" )
181200 df = dfs [table_number ]
182- importer = TableImportEngine (** kwargs )
183- schema = importer .import_from_dataframe (df )
201+ if data_output :
202+ df .to_csv (data_output , index = False , sep = "\t " )
203+ if pandera :
204+ ge = PandasDataGeneralizer (** kwargs )
205+ else :
206+ ge = CsvDataGeneralizer (** kwargs )
207+ schema = ge .convert_from_dataframe (df , class_name = class_name , schema_name = schema_name )
184208 write_schema (schema , output )
185209
186210
@@ -241,13 +265,15 @@ def import_htmltable(url, output, class_name, schema_name, columns,
241265 table_number : int , data_output ,
242266 ** kwargs ):
243267 """
244- Generalizes from a table parsed from a URL
268+ Imports from a table parsed from a URL using SchemaSheets
245269
246270 Uses pandas/beautiful soup
247271 """
248272 dfs = pd .read_html (url )
249273 logging .info (f"{ url } has { len (dfs )} tables" )
250274 df = dfs [table_number ]
275+ if data_output :
276+ df .to_csv (data_output , index = False , sep = "\t " )
251277 ie = TableImportEngine (columns = columns .split ("," ), ** kwargs )
252278 schema = ie .import_from_dataframe (df )
253279 write_schema (schema , output )
@@ -339,6 +365,26 @@ def import_json_schema(input, output, import_project: bool, schema_name, format,
339365 ie .import_project (input , output , name = schema_name , format = format )
340366
341367
@main.command()
@click.argument('input')
@output_option
@schema_name_option
@schema_id_option
def import_frictionless(input, output, schema_name, schema_id, **kwargs):
    """
    Imports from Frictionless data package to LinkML

    See :ref:`importers` for more on the importer framework

    Example:

        schemauto import-frictionless cfde.package.json
    """
    # Any remaining click kwargs are forwarded to the engine constructor.
    engine = FrictionlessImportEngine(**kwargs)
    # Convert the package file, stamping the requested schema name/id,
    # then serialize the resulting SchemaDefinition to the output target.
    schema = engine.convert(input, name=schema_name, id=schema_id)
    write_schema(schema, output)
387+
342388@main .command ()
343389@click .argument ('owlfile' )
344390@output_option
@@ -428,7 +474,7 @@ def generalize_rdf(rdffile, dir, output, **args):
428474@output_option
429475def annotate_schema (schema : str , input : str , output : str , ** kwargs ):
430476 """
431- Annotate all elements of a schema
477+ Annotate all elements of a schema.
432478
433479 This uses OAK (https://incatools.github.io/ontology-access-kit),
434480 and you can provide any OAK backend that supports text annotation.
@@ -471,6 +517,10 @@ def enrich_schema(schema: str, input: str, output: str, annotate: bool, **args):
471517 """
472518 Enrich a schema using an ontology.
473519
520+ Here, "enrich" means copying over metadata from the ontology to the schema.
521+ For example, if the schema has a class "Gene" that is mapped to a SO class for "gene",
522+ then calling this command will copy the SO class definition to the schema class.
523+
474524 This will use OAK to add additional metadata using uris and mappings in the schema.
475525
476526 For example, if your schema has a class with a mapping to a SO class,
0 commit comments