
Commit 1bb3268

Adding CLI option for import-frictionless. (#118)
* Adding CLI option for import-frictionless; removed old CLI options
* update
* updating to latest schemabuilder
1 parent 5347c8e commit 1bb3268

File tree

11 files changed: +2962 -754 lines

notebooks/images/FRED.png

1.09 MB

poetry.lock

Lines changed: 2795 additions & 656 deletions
Generated file; diff not rendered.

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ packages = [
 
 [tool.poetry.dependencies]
 python = "^3.9"
-linkml = "^1.3.5"
+linkml = ">=1.5.4"
 mkdocs = "^1.2.3"
 pandas = "^1.3.5"
 python-dateutil = "^2.8.2"

schema_automator/cli.py

Lines changed: 69 additions & 19 deletions
@@ -13,7 +13,7 @@
 from linkml_runtime.linkml_model import SchemaDefinition
 from oaklib.selector import get_resource_from_shorthand, get_implementation_from_shorthand
 
-from schema_automator import JsonLdAnnotator
+from schema_automator import JsonLdAnnotator, FrictionlessImportEngine
 from schema_automator.annotators.schema_annotator import SchemaAnnotator
 from schema_automator.generalizers.csv_data_generalizer import CsvDataGeneralizer
 from schema_automator.generalizers.generalizer import DEFAULT_CLASS_NAME, DEFAULT_SCHEMA_NAME

@@ -44,6 +44,9 @@
     default=DEFAULT_SCHEMA_NAME,
     show_default=True,
     help='Schema name')
+schema_id_option = click.option(
+    '--schema-id',
+    help='Schema id')
 annotator_option = click.option(
     '--annotator',
     '-A',

@@ -52,6 +55,18 @@
     "--use-attributes/--no-use-attributes",
     help="If true, use attributes over slots/slot_usage"
 )
+column_separator_option = click.option('--column-separator', '-s', default='\t', help='separator')
+
+# generalizer options
+
+downcase_header_option = click.option('--downcase-header/--no-downcase-header', default=False, help='if true make headers lowercase')
+snakecase_header_option = click.option('--snakecase-header/--no-snakecase-header', default=False, help='if true make headers snakecase')
+infer_foreign_keys_option = click.option('--infer-foreign-keys/--no-infer-foreign-keys', default=False, help='infer ranges/foreign keys')
+enum_columns_option = click.option('--enum-columns', '-E', multiple=True, help='column(s) that is forced to be an enum')
+enum_mask_columns_option = click.option('--enum-mask-columns', multiple=True, help='column(s) that are excluded from being enums')
+max_enum_size_option = click.option('--max-enum-size', default=50, help='do not create an enum if more than max distinct members')
+enum_threshold_option = click.option('--enum-threshold', default=0.1, help='if the number of distinct values / rows is less than this, do not make an enum')
+
 
 @click.group()
 @click.option("-v", "--verbose",

@@ -89,13 +104,12 @@ def main(verbose: int, quiet: bool):
 @schema_name_option
 @annotator_option
 @click.option('--class-name', '-c', default=DEFAULT_CLASS_NAME, help='Core class name in schema')
-@click.option('--column-separator', '-s', default='\t', help='separator')
-@click.option('--downcase-header/--no-downcase-header', default=False, help='if true make headers lowercase')
-@click.option('--enum-columns', '-E', multiple=True, help='column that is forced to be an enum')
-@click.option('--enum-threshold', type=click.FLOAT, help='set high to be more inclusive')
-@click.option('--max-enum-size',
-              type=click.INT,
-              help='set high to be more inclusive')
+@column_separator_option
+@downcase_header_option
+@snakecase_header_option
+@enum_columns_option
+@enum_threshold_option
+@max_enum_size_option
 @click.option('--data-dictionary-row-count',
               type=click.INT,
               help='rows that provide metadata about columns')

@@ -128,13 +142,12 @@ def generalize_tsv(tsvfile, output, class_name, schema_name, pandera: bool, anno
 @click.argument('tsvfiles', nargs=-1)  # input TSV (must have column headers
 @output_option
 @schema_name_option
-@click.option('--column-separator', '-s', default='\t', help='separator')
-@click.option('--downcase-header/--no-downcase-header', default=False, help='if true make headers lowercase')
-@click.option('--infer-foreign-keys/--no-infer-foreign-keys', default=False, help='infer ranges/foreign keys')
-@click.option('--enum-columns', '-E', multiple=True, help='column(s) that is forced to be an enum')
-@click.option('--enum-mask-columns', multiple=True, help='column(s) that are excluded from being enums')
-@click.option('--max-enum-size', default=50, help='do not create an enum if more than max distinct members')
-@click.option('--enum-threshold', default=0.1, help='if the number of distinct values / rows is less than this, do not make an enum')
+@column_separator_option
+@downcase_header_option
+@snakecase_header_option
+@enum_columns_option
+@enum_threshold_option
+@max_enum_size_option
 @click.option('--robot/--no-robot', default=False, help='set if the TSV is a ROBOT template')
 def generalize_tsvs(tsvfiles, output, schema_name, **kwargs):
     """

@@ -157,6 +170,12 @@ def generalize_tsvs(tsvfiles, output, schema_name, **kwargs):
 @click.argument('url')  # input TSV (must have column headers
 @output_option
 @schema_name_option
+@column_separator_option
+@downcase_header_option
+@snakecase_header_option
+@enum_columns_option
+@enum_threshold_option
+@max_enum_size_option
 @click.option('--class-name', '-c', default=DEFAULT_CLASS_NAME, help='Core class name in schema')
 @click.option('--pandera/--no-pandera', default=False, help='set to use panderas as inference engine')
 @click.option('--data-output', help='Path to file of downloaded data')

@@ -179,8 +198,13 @@ def generalize_htmltable(url, output, class_name, schema_name, pandera: bool,
     dfs = pd.read_html(url)
     logging.info(f"{url} has {len(dfs)} tables")
     df = dfs[table_number]
-    importer = TableImportEngine(**kwargs)
-    schema = importer.import_from_dataframe(df)
+    if data_output:
+        df.to_csv(data_output, index=False, sep="\t")
+    if pandera:
+        ge = PandasDataGeneralizer(**kwargs)
+    else:
+        ge = CsvDataGeneralizer(**kwargs)
+    schema = ge.convert_from_dataframe(df, class_name=class_name, schema_name=schema_name)
     write_schema(schema, output)
 
 

@@ -241,13 +265,15 @@ def import_htmltable(url, output, class_name, schema_name, columns,
                      table_number: int, data_output,
                      **kwargs):
     """
-    Generalizes from a table parsed from a URL
+    Imports from a table parsed from a URL using SchemaSheets
 
     Uses pandas/beautiful soup
     """
     dfs = pd.read_html(url)
     logging.info(f"{url} has {len(dfs)} tables")
     df = dfs[table_number]
+    if data_output:
+        df.to_csv(data_output, index=False, sep="\t")
     ie = TableImportEngine(columns=columns.split(","), **kwargs)
     schema = ie.import_from_dataframe(df)
     write_schema(schema, output)

@@ -339,6 +365,26 @@ def import_json_schema(input, output, import_project: bool, schema_name, format,
     ie.import_project(input, output, name=schema_name, format=format)
 
 
+@main.command()
+@click.argument('input')
+@output_option
+@schema_name_option
+@schema_id_option
+def import_frictionless(input, output, schema_name, schema_id, **kwargs):
+    """
+    Imports from Frictionless data package to LinkML
+
+    See :ref:`importers` for more on the importer framework
+
+    Example:
+
+        schemauto import-frictionless cfde.package.json
+    """
+    ie = FrictionlessImportEngine(**kwargs)
+    schema = ie.convert(input, name=schema_name, id=schema_id)
+    write_schema(schema, output)
+
+
 @main.command()
 @click.argument('owlfile')
 @output_option

@@ -428,7 +474,7 @@ def generalize_rdf(rdffile, dir, output, **args):
 @output_option
 def annotate_schema(schema: str, input: str, output: str, **kwargs):
     """
-    Annotate all elements of a schema
+    Annotate all elements of a schema.
 
     This uses OAK (https://incatools.github.io/ontology-access-kit),
     and you can provide any OAK backend that supports text annotation.

@@ -471,6 +517,10 @@ def enrich_schema(schema: str, input: str, output: str, annotate: bool, **args):
     """
     Enrich a schema using an ontology.
 
+    Here, "enrich" means copying over metadata from the ontology to the schema.
+    For example, if the schema has a class "Gene" that is mapped to a SO class for "gene",
+    then calling this command will copy the SO class definition to the schema class.
+
    This will use OAK to add additional metadata using uris and mappings in the schema.
 
    For example, if your schema has a class with a mapping to a SO class,
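The option deduplication above works because click.option(...) returns an ordinary decorator, so it can be assigned to a module-level name once and stacked onto any number of commands. A minimal self-contained sketch of the pattern (the preview command is illustrative, not part of cli.py):

import click

# click.option returns a reusable decorator; define it once,
# apply it to as many commands as needed.
column_separator_option = click.option('--column-separator', '-s', default='\t', help='separator')


@click.command()
@column_separator_option
def preview(column_separator):
    """Toy command showing a shared option in use."""
    click.echo(f"separator={column_separator!r}")


if __name__ == '__main__':
    preview()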

schema_automator/generalizers/csv_data_generalizer.py

Lines changed: 22 additions & 55 deletions
@@ -15,6 +15,7 @@
 from linkml_runtime import SchemaView
 from linkml_runtime.linkml_model import SchemaDefinition, ClassDefinition, TypeDefinition, SlotDefinition
 from linkml_runtime.linkml_model.meta import UniqueKey
+from linkml_runtime.utils.formatutils import underscore
 from quantulum3 import parser as q_parser
 from dataclasses import dataclass, field
 

@@ -93,6 +94,9 @@ class CsvDataGeneralizer(Generalizer):
     downcase_header: bool = False
     """If true, coerce column names to be lower case"""
 
+    snakecase_header: bool = False
+    """If true, coerce column names to be snake case"""
+
     infer_foreign_keys: bool = False
     """For multi-CVS files, infer linkages between rows"""
 

@@ -127,10 +131,14 @@ def infer_linkages(self, files: List[str], **kwargs) -> List[ForeignKey]:
             c = os.path.splitext(os.path.basename(file))[0]
             if self.downcase_header:
                 c = c.lower()
+            if self.snakecase_header:
+                c = underscore(c)
             logging.info(f'READING {file} ')
             df = pd.read_csv(file, sep=self.column_separator, skipinitialspace=True).fillna("")
             if self.downcase_header:
                 df = df.rename(columns=str.lower)
+            if self.snakecase_header:
+                df = df.rename(columns=underscore)
             exclude = []
             for col in df.columns:
                 vals = set(df[col].tolist())

@@ -242,6 +250,8 @@ def convert_multiple(self, files: List[str], **kwargs) -> SchemaDefinition:
             c = os.path.splitext(os.path.basename(file))[0]
             if self.downcase_header:
                 c = c.lower()
+            if self.snakecase_header:
+                c = underscore(c)
             s = self.convert(file, class_name=c, **kwargs)
             if s is not None:
                 schemas.append(s)

@@ -267,6 +277,16 @@ def convert(self, file: str, **kwargs) -> SchemaDefinition:
             rr = csv.DictReader(tsv_file, fieldnames=header, delimiter=self.column_separator, skipinitialspace=False)
             return self.convert_dicts([r for r in rr], **kwargs)
 
+    def convert_from_dataframe(self, df: pd.DataFrame, **kwargs) -> SchemaDefinition:
+        """
+        Converts a single dataframe to a single-class schema
+
+        :param df:
+        :param kwargs:
+        :return:
+        """
+        return self.convert_dicts(df.to_dict('records'), **kwargs)
+
     def read_slot_tsv(self, file: str, **kwargs) -> Dict:
         with open(file, newline='') as tsv_file:
             rows_list = csv.reader(tsv_file, delimiter=self.column_separator)

@@ -359,6 +379,8 @@ def convert_dicts(self,
         for row in rr:
             if self.downcase_header:
                 row = {k.lower(): v for k, v in row.items()}
+            if self.snakecase_header:
+                row = {underscore(k): v for k, v in row.items()}
             n += 1
             if n == 1 and self.robot:
                 for k, v in row.items():

@@ -784,60 +806,5 @@ def add_missing_to_schema(schema: SchemaDefinition):
                            description='Holds a measurement serialized as a string')
 
 
-@click.group()
-def main():
-    pass
-
-
-@main.command()
-@click.argument('tsvfile')  # input TSV (must have column headers
-@click.option('--output', '-o', help='Output file')
-@click.option('--class_name', '-c', default='example', help='Core class name in schema')
-@click.option('--schema_name', '-n', default='example', help='Schema name')
-@click.option('--separator', '-s', default='\t', help='separator')
-@click.option('--downcase-header/--no-downcase-header', default=False, help='if true make headers lowercase')
-@click.option('--enum-columns', '-E', multiple=True, help='column that is forced to be an enum')
-@click.option('--robot/--no-robot', default=False, help='set if the TSV is a ROBOT template')
-def tsv2model(tsvfile, output, separator, class_name, schema_name, **kwargs):
-    """ Infer a model from a TSV """
-    ie = CsvDataGeneralizer(**kwargs)
-    schema = ie.convert(tsvfile, class_name=class_name, schema_name=schema_name)
-    write_schema(schema, output)
-
-
-@main.command()
-@click.argument('tsvfiles', nargs=-1)  # input TSV (must have column headers
-@click.option('--output', '-o', help='Output file')
-@click.option('--schema_name', '-n', default='example', help='Schema name')
-@click.option('--file_separator', '-s', default='\t', help='separator')
-@click.option('--downcase-header/--no-downcase-header', default=False, help='if true make headers lowercase')
-@click.option('--infer-foreign-keys/--no-infer-foreign-keys', default=False, help='infer ranges/foreign keys')
-@click.option('--enum-columns', '-E', multiple=True, help='column(s) that is forced to be an enum')
-@click.option('--enum-mask-columns', multiple=True, help='column(s) that are excluded from being enums')
-@click.option('--max-enum-size', default=50, help='do not create an enum if more than max distinct members')
-@click.option('--enum-threshold', default=0.1, help='if the number of distinct values / rows is less than this, do not make an enum')
-@click.option('--robot/--no-robot', default=False, help='set if the TSV is a ROBOT template')
-def tsvs2model(tsvfiles, output, schema_name, **kwargs):
-    """ Infer a model from multiple TSVs """
-    ie = CsvDataGeneralizer(**kwargs)
-    schema = ie.convert_multiple(tsvfiles, schema_name=schema_name)
-    write_schema(schema, output)
-
-
-@main.command()
-@click.argument('yamlfile')
-@click.option('--zooma-confidence', '-Z', help='zooma confidence')
-@click.option('--results', '-r', help='mapping results file')
-def enrich(yamlfile, results, **args):
-    """ Infer a model from a TSV """
-    yamlobj = yaml.load(open(yamlfile))
-    cache = {}
-    infer_enum_meanings(yamlobj, cache=cache)
-    if results is not None:
-        with open(results, "w") as io:
-            io.write(yaml.dump(cache))
-    print(yaml.dump(yamlobj, default_flow_style=False, sort_keys=False))
-
-
 if __name__ == '__main__':
     main()
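Taken together, the new snakecase_header flag and convert_from_dataframe method let a pandas DataFrame be generalized directly, with headers normalized through linkml_runtime's underscore(). A usage sketch, with an invented frame whose "Gene ID" column exercises the snake-casing:

import pandas as pd
from linkml_runtime.dumpers import yaml_dumper
from schema_automator.generalizers.csv_data_generalizer import CsvDataGeneralizer

# Invented sample data; the "Gene ID" header gets rewritten by
# linkml_runtime.utils.formatutils.underscore when snakecase_header=True.
df = pd.DataFrame({
    "Gene ID": ["g1", "g2", "g3"],
    "Status": ["active", "inactive", "active"],
})

ge = CsvDataGeneralizer(snakecase_header=True)
schema = ge.convert_from_dataframe(df, class_name="Gene", schema_name="genes")
print(yaml_dumper.dumps(schema))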

schema_automator/importers/frictionless_import_engine.py

Lines changed: 2 additions & 4 deletions
@@ -46,7 +46,7 @@ class FrictionlessImportEngine(ImportEngine):
 
     """
 
-    def convert(self, file: str, id: str,name: str, **kwargs) -> SchemaDefinition:
+    def convert(self, file: str, id: str=None, name: str=None, **kwargs) -> SchemaDefinition:
         """
         Converts one or more JSON files into a Schema

@@ -59,8 +59,6 @@ def convert(self, file: str, id: str=None, name: str=None, **kwargs) -> SchemaDefinition:
         schema = sb.schema
         if id:
             schema.id = id
-        if name:
-            sb.add_prefix(name, f"{id}/")
         if not name:
             name = package.name
         if name:

@@ -128,7 +126,7 @@ def add_enum(self, sb: SchemaBuilder, field: fl.Field) -> EnumDefinition:
             if len(toks) == 2:
                 [prefix, short] = toks
                 pv = PermissibleValue(short, meaning=code)
-                sb.add_prefix(prefix, f"{sb.schema.id}/{prefix}/")
+                sb.add_prefix(prefix, f"{sb.schema.id}/{prefix}/", replace_if_present=True)
             e.permissible_values[pv.text] = pv
         if e.name in sb.schema.enums:
             raise NotImplementedError(f"Cannot yet merge enums")
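With id and name now defaulting to None (the package's own name is used when name is omitted), the importer can be driven from Python as well as from the new import-frictionless command. A minimal sketch, assuming a Frictionless package descriptor such as the cfde.package.json named in the CLI docstring is present locally:

from linkml_runtime.dumpers import yaml_dumper
from schema_automator.importers.frictionless_import_engine import FrictionlessImportEngine

ie = FrictionlessImportEngine()
# name and id are optional; omitted here, so the package name is used.
schema = ie.convert("cfde.package.json")
print(yaml_dumper.dumps(schema))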

schema_automator/importers/rdfs_import_engine.py

Lines changed: 3 additions & 3 deletions
@@ -104,13 +104,14 @@ def convert(
         if name is None:
             name = "example"
         sb = SchemaBuilder(name=name)
+        sb.add_defaults()
         schema = sb.schema
         for k, v in g.namespaces():
-            sb.add_prefix(k, v)
+            sb.add_prefix(k, v, replace_if_present=True)
         if default_prefix is not None:
             schema.default_prefix = default_prefix
             if default_prefix not in schema.prefixes:
-                sb.add_prefix(default_prefix, model_uri)
+                sb.add_prefix(default_prefix, model_uri, replace_if_present=True)
             schema.id = schema.prefixes[default_prefix].prefix_reference
         cls_slots = defaultdict(list)
         props = []

@@ -155,7 +156,6 @@ def convert(
             c.slots = cls_slots.get(cn, [])
             c.class_uri = str(s.n3(g.namespace_manager))
             sb.add_class(c)
-        sb.add_defaults()
         if identifier is not None:
             id_slot = SlotDefinition(identifier, identifier=True, range="uriorcurie")
             schema.slots[identifier] = id_slot

schema_automator/importers/tabular_import_engine.py

Lines changed: 3 additions & 1 deletion
@@ -40,14 +40,16 @@ def import_from_dataframe(self, df: pd.DataFrame):
         :return:
         """
         tf = NamedTemporaryFile(delete=False)
+        if not self.columns:
+            raise ValueError("Must specify columns")
+        logging.info(f"Using columns: {self.columns}")
         ix = 1
         line = pd.DataFrame(dict(zip(df.head(), self.columns)), index=[ix])
         df = pd.concat([df.iloc[:ix-1], line, df.iloc[ix-1:]]).reset_index(drop=True)
         if self.parent:
             df.insert(0,
                       column="parent",
                       value=[f">{self.element_type}"] + [self.parent] * (len(df) - 1))
-        #print(df)
         df.to_csv(tf.name, sep='\t', index=False)
         #print(open(tf.name, 'r').read())
         #element_map = dict(zip(df.head(), self.columns))
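The method works by splicing a SchemaSheets descriptor row in directly under the real header before writing the TSV out for schemasheets to consume. The splice itself is plain pandas and can be run in isolation; the frame and descriptor list below are invented:

import pandas as pd

df = pd.DataFrame({"name": ["g1"], "desc": ["a gene"]})
columns = ["name", "description"]  # SchemaSheets column descriptors

# Pair each header with its descriptor and insert the pairing as the
# first data row, mirroring import_from_dataframe above.
ix = 1
line = pd.DataFrame(dict(zip(df.head(), columns)), index=[ix])
df = pd.concat([df.iloc[:ix - 1], line, df.iloc[ix - 1:]]).reset_index(drop=True)
print(df.to_csv(sep="\t", index=False))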
