Skip to content

Commit b96e566

Browse files
authored
CsvGeneralizer: Adding ability to interpret metadata rows (#106)
- adding docs to parameters - expose more parameters in CLI - additional tests
1 parent 455c91a commit b96e566

File tree

7 files changed

+632
-2139
lines changed

7 files changed

+632
-2139
lines changed

poetry.lock

Lines changed: 384 additions & 2108 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ psycopg2-binary = "^2.9.2"
2424
strsimpy = "^0.2.1"
2525
requests = "^2.26.0"
2626
bioregistry = "^0.5.87"
27-
oaklib = "^0.1.43"
27+
oaklib = "^0.1.52"
2828
pandera = "^0.12.0"
2929
tomlkit = "^0.11.4"
3030
inflect = "^6.0.0"

schema_automator/annotators/schema_annotator.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ def annotate_element(self, elt: Union[PermissibleValue, Element]) -> None:
5555
if self.mine_descriptions and elt.description:
5656
texts.append(elt.description)
5757
for text in texts:
58+
logging.info(f"Annotating: {text}")
5859
for r in self.annotate_text(text):
5960
logging.debug(f'MATCH: {r}')
6061
if self.allow_partial or r.matches_whole_text:
@@ -91,13 +92,15 @@ def annotate_text(self, text: str) -> Iterator[TextAnnotation]:
9192
oi = self.ontology_implementation
9293
text_exp = uncamel(text) # TODO: use main linkml_runtime method
9394
if isinstance(oi, TextAnnotatorInterface):
95+
logging.debug(f"Using TextAnnotatorInterface on {text_exp}")
9496
# TextAnnotation is available; use this by default
9597
for r in oi.annotate_text(text_exp):
9698
yield r
9799
if text_exp != text.lower():
98100
for r in oi.annotate_text(text_exp):
99101
yield r
100102
elif isinstance(oi, SearchInterface):
103+
logging.debug(f"Using SearchInterface on {text_exp}")
101104
# use search as an alternative
102105
cfg = SearchConfiguration(is_complete=True)
103106
for r in oi.basic_search(text, config=cfg):
@@ -128,7 +131,15 @@ def enrich(self, schema: Union[SchemaDefinition, str]) -> SchemaDefinition:
128131
Enrich a schema by performing lookups on the external ontology/vocabulary endpoint,
129132
and copying over metadata
130133
131-
Currently the only metadata obtained is text definitions
134+
Currently, the only metadata obtained is text definitions
135+
136+
.. code-block:: python
137+
138+
>>> from schema_automator.annotators.schema_annotator import SchemaAnnotator
139+
>>> from oaklib.selector import get_implementation_from_shorthand
140+
>>> oi = get_implementation_from_shorthand("sqlite:obo:so")
141+
>>> sa = SchemaAnnotator(ontology_implementation=oi)
142+
>>> schema = sa.enrich("tests/data/schema.yaml")
132143
133144
:param schema:
134145
:return:
@@ -160,7 +171,7 @@ def _add_description_from_curies(self, elt: Union[Element, PermissibleValue], cu
160171
if elt.description:
161172
break
162173
try:
163-
defn = oi.get_definition_by_curie(x)
174+
defn = oi.definition(x)
164175
if defn:
165176
elt.description = defn
166177
else:

schema_automator/cli.py

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,13 @@ def main(verbose: int, quiet: bool):
8787
@click.option('--column-separator', '-s', default='\t', help='separator')
8888
@click.option('--downcase-header/--no-downcase-header', default=False, help='if true make headers lowercase')
8989
@click.option('--enum-columns', '-E', multiple=True, help='column that is forced to be an enum')
90+
@click.option('--enum-threshold', type=click.FLOAT, help='set high to be more inclusive')
91+
@click.option('--max-enum-size',
92+
type=click.INT,
93+
help='set high to be more inclusive')
94+
@click.option('--data-dictionary-row-count',
95+
type=click.INT,
96+
help='rows that provide metadata about columns')
9097
@click.option('--robot/--no-robot', default=False, help='set if the TSV is a ROBOT template')
9198
@click.option('--pandera/--no-pandera', default=False, help='set to use panderas as inference engine')
9299
def generalize_tsv(tsvfile, output, class_name, schema_name, pandera: bool, annotator, **kwargs):
@@ -99,6 +106,7 @@ def generalize_tsv(tsvfile, output, class_name, schema_name, pandera: bool, anno
99106
100107
schemauto generalize-tsv --class-name Person --schema-name PersonInfo my/data/persons.tsv
101108
"""
109+
kwargs = {k:v for k, v in kwargs.items() if v is not None}
102110
if pandera:
103111
ie = PandasDataGeneralizer(**kwargs)
104112
else:
@@ -387,16 +395,32 @@ def annotate_schema(schema: str, input: str, output: str, **kwargs):
387395
@main.command()
388396
@click.argument('schema')
389397
@click.option('--input', '-i', help="OAK input ontology selector")
398+
@click.option('--annotate/--no-annotate', default=True, help="If true, annotate the schema")
390399
@output_option
391-
def enrich_schema(schema: str, input: str, output: str, **args):
400+
def enrich_schema(schema: str, input: str, output: str, annotate: bool, **args):
392401
"""
393-
Annotate all elements of a schema
402+
Enrich a schema using an ontology.
403+
404+
This will use OAK to add additional metadata using uris and mappings in the schema.
405+
406+
For example, if your schema has a class with a mapping to a SO class,
407+
then the definition of that will be copied to the class description.
408+
409+
Example:
410+
411+
schemauto enrich-schema -i bioportal: my-schema.yaml -o my-enriched.yaml
412+
413+
If your schema has no mappings you can use --annotate to add them
414+
415+
Example:
394416
395-
Requires Bioportal API key
417+
schemauto enrich-schema -i so.obo --annotate my-schema.yaml -o my-enriched.yaml
396418
"""
397419
impl = get_implementation_from_shorthand(input)
398420
annr = SchemaAnnotator(impl)
399421
logging.info(f"Enriching: {schema}")
422+
if annotate:
423+
schema = annr.annotate_schema(schema)
400424
schema = annr.enrich(schema)
401425
write_schema(schema, output)
402426

schema_automator/generalizers/csv_data_generalizer.py

Lines changed: 133 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import click
22
import logging
33
import yaml
4-
from typing import Dict, List, Optional
4+
from typing import Dict, List, Optional, Set, Any
55
from collections import defaultdict
66
import os
77
import re
@@ -14,6 +14,7 @@
1414
from deprecation import deprecated
1515
from linkml_runtime import SchemaView
1616
from linkml_runtime.linkml_model import SchemaDefinition, ClassDefinition, TypeDefinition, SlotDefinition
17+
from linkml_runtime.linkml_model.meta import UniqueKey
1718
from quantulum3 import parser as q_parser
1819
from dataclasses import dataclass, field
1920

@@ -63,17 +64,46 @@ class CsvDataGeneralizer(Generalizer):
6364
"""
6465

6566
column_separator: str = "\t"
67+
"""character that separates columns in the input file"""
68+
6669
schema_name: str = 'example'
70+
"""LinkML schema name (no spaces)"""
71+
6772
robot: bool = False
73+
"""If true, conforms to robot template format. Data dictionary rows start with '>'"""
74+
75+
data_dictionary_row_count: int = field(default=0)
76+
"""number of rows after header containing data dictionary information"""
77+
6878
enum_columns: List[str] = field(default_factory=lambda: [])
79+
"""List of columns that are coerced into enums"""
80+
6981
enum_mask_columns: List[str] = field(default_factory=lambda: [])
82+
"""List of columns that are excluded from being enums"""
83+
7084
enum_threshold: float = 0.1
85+
"""If the number of distinct values divided by the total number of values is less than this threshold, the column is considered an enum"""
86+
7187
enum_strlen_threshold: int = 30
88+
"""Maximum length of a string to be considered a permissible enum value"""
89+
7290
max_enum_size: int = 50
91+
"""Max number of permissible values for a column to be considered an enum"""
92+
7393
downcase_header: bool = False
94+
"""If true, coerce column names to be lower case"""
95+
7496
infer_foreign_keys: bool = False
75-
max_pk_len: int = 60 # URIs can be long..
97+
"""For multi-CSV files, infer linkages between rows"""
98+
99+
max_pk_len: int = 60
100+
"""Maximum length to be considered for a primary key column. Note: URIs can be long"""
101+
76102
min_distinct_fk_val: int = 8
103+
"""Minimum number of distinct values a column must have before it is considered as a foreign key."""
104+
105+
source_schema: Optional[SchemaDefinition] = None
106+
"""Optional base schema to draw from"""
77107

78108
def infer_linkages(self, files: List[str], **kwargs) -> List[ForeignKey]:
79109
"""
@@ -297,14 +327,31 @@ def convert_dicts(self,
297327
rr: List[Dict],
298328
schema_name: str = 'example',
299329
class_name: str = DEFAULT_CLASS_NAME,
300-
**kwargs) -> SchemaDefinition:
330+
**kwargs) -> Optional[SchemaDefinition]:
331+
"""
332+
Converts a list of row objects to a schema.
333+
334+
Each row is a data item, presumed to be of the same type,
335+
that is generalized.
336+
337+
:param rr:
338+
:param schema_name:
339+
:param class_name:
340+
:param kwargs:
341+
:return:
342+
"""
301343
slots = {}
302-
slot_values = {}
344+
345+
slot_distinct_values: Dict[str, Set[Any]] = {}
346+
"""distinct values for each slot"""
347+
348+
slot_values: Dict[str, List[Any]] = defaultdict(list)
349+
"""all values for each slot"""
350+
303351
n = 0
304352
enums = {}
305353
robot_defs = {}
306354
slot_usage = {}
307-
types = {}
308355
enum_columns = self.enum_columns
309356
enum_mask_columns = self.enum_mask_columns
310357
if len(rr) == 0:
@@ -317,6 +364,14 @@ def convert_dicts(self,
317364
for k, v in row.items():
318365
robot_defs[k] = v
319366
continue
367+
if n <= self.data_dictionary_row_count:
368+
if self.source_schema is None:
369+
self.source_schema = SchemaDefinition(id="auto", name="auto")
370+
for k, v in row.items():
371+
if k not in self.source_schema.slots:
372+
self.source_schema.slots[k] = SlotDefinition(k)
373+
self.source_schema.slots[k].description = v
374+
continue
320375
for k, v in row.items():
321376
if k is None or k == '':
322377
continue
@@ -332,22 +387,44 @@ def convert_dicts(self,
332387
vs = [v]
333388
if k not in slots:
334389
slots[k] = {'range': None}
335-
slot_values[k] = set()
390+
slot_distinct_values[k] = set()
336391
if v is not None and v != "" and not str(v).startswith('$ref:'):
337392
slots[k]['examples'] = [{'value': v}]
338-
slot_values[k].update(vs)
393+
slot_distinct_values[k].update(vs)
394+
slot_values[k] += vs
339395
if len(vs) > 1:
340396
slots[k]['multivalued'] = True
341397
types = {}
342398
new_slots = {}
399+
col_number = 0
400+
unique_keys = []
343401
for sn, s in slots.items():
344-
vals = slot_values[sn]
402+
col_number += 1
403+
is_unique = len(set(slot_values[sn])) == len(slot_values[sn])
404+
is_pk = is_unique and col_number == 1
405+
if self.source_schema and sn in self.source_schema.slots and self.source_schema.slots[sn].identifier:
406+
is_pk = True
407+
if is_pk:
408+
s['identifier'] = True
409+
elif is_unique:
410+
unique_keys.append(sn)
411+
vals = slot_distinct_values[sn]
412+
if self.source_schema:
413+
if sn in self.source_schema.slots:
414+
s['description'] = self.source_schema.slots[sn].description
345415
s['range'] = infer_range(s, vals, types)
416+
logging.info(f"Slot {sn} has range {s['range']}")
346417
if (s['range'] == 'string' or sn in enum_columns) and sn not in enum_mask_columns:
418+
filtered_vals = \
419+
[v
420+
for v in slot_values[sn]
421+
if not isinteger(v) and not isfloat(v) and not isboolean(v) and not is_date(v)]
422+
n_filtered_vals = len(filtered_vals) + 1
347423
n_distinct = len(vals)
348424
longest = max([len(str(v)) for v in vals]) if n_distinct > 0 else 0
425+
logging.info(f"Considering {sn} as enum: {n_distinct} distinct values / {n_filtered_vals}, longest={longest}")
349426
if sn in enum_columns or \
350-
((n_distinct / n) < self.enum_threshold and 0 < n_distinct <= self.max_enum_size
427+
((n_distinct / n_filtered_vals) < self.enum_threshold and 0 < n_distinct <= self.max_enum_size
351428
and longest < self.enum_strlen_threshold):
352429
enum_name = sn.replace(' ', '_').replace('(s)', '')
353430
enum_name = f'{enum_name}_enum'
@@ -416,6 +493,9 @@ def convert_dicts(self,
416493
for sn, s in new_slots.items():
417494
if sn not in slots:
418495
slots[sn] = s
496+
497+
unique_keys = [UniqueKey(f"{k}_key",
498+
unique_key_slots=[k]) for k in unique_keys]
419499
schema = SchemaDefinition(
420500
id=f'https://w3id.org/{schema_name}',
421501
name=schema_name,
@@ -426,7 +506,9 @@ def convert_dicts(self,
426506
classes=[
427507
ClassDefinition(class_name,
428508
slots=class_slots,
429-
slot_usage=slot_usage)
509+
slot_usage=slot_usage,
510+
unique_keys=unique_keys,
511+
)
430512
],
431513
slots=slots,
432514
enums=enums
@@ -465,6 +547,16 @@ def isfloat(value):
465547
except ValueError:
466548
return False
467549

550+
def isinteger(value):
551+
try:
552+
int(value)
553+
return True
554+
except ValueError:
555+
return False
556+
557+
def isboolean(value):
558+
return value in ['true', 'false']
559+
468560

469561
def is_measurement(value):
470562
ms = q_parser.parse(value)
@@ -503,8 +595,18 @@ def is_all_measurement(values):
503595
return False
504596

505597

506-
def infer_range(slot: dict, vals: set, types: dict) -> str:
598+
def infer_range(slot: dict, vals: set, types: dict, coerce=True) -> str:
599+
"""
600+
Infers the range of a slot based on the values
601+
602+
:param slot:
603+
:param vals:
604+
:param types:
605+
:return:
606+
"""
607+
logging.info(f"Inferring value for {list(vals)[0:5]}...")
507608
nn_vals = [v for v in vals if v is not None and v != ""]
609+
logging.info(f"FILTERED: {list(nn_vals)[0:5]}...")
508610
if len(nn_vals) == 0:
509611
return 'string'
510612
if all(str(v).startswith('$ref:') for v in nn_vals):
@@ -513,12 +615,15 @@ def infer_range(slot: dict, vals: set, types: dict) -> str:
513615
return 'integer'
514616
if all(isinstance(v, float) for v in nn_vals):
515617
return 'float'
516-
if all(str(v).isdigit() for v in nn_vals):
517-
return 'integer'
518-
if all(is_date(v) for v in nn_vals):
519-
return 'datetime'
520-
if all(isfloat(v) for v in nn_vals):
521-
return 'float'
618+
if coerce:
619+
if all(isinteger(v) for v in nn_vals):
620+
return 'integer'
621+
if all(isboolean(v) for v in nn_vals):
622+
return 'boolean'
623+
if all(isfloat(v) for v in nn_vals):
624+
return 'float'
625+
if all(is_date(v) for v in nn_vals):
626+
return 'datetime'
522627
if is_all_measurement(nn_vals):
523628
return 'measurement'
524629
v0 = nn_vals[0]
@@ -535,12 +640,17 @@ def infer_range(slot: dict, vals: set, types: dict) -> str:
535640
return 'string'
536641

537642

538-
def get_db(db_id: str) -> str:
539-
parts = db_id.split(':')
540-
if len(parts) > 1:
541-
return parts[0]
542-
else:
543-
return None
643+
def get_db(db_id: str) -> Optional[str]:
644+
"""
645+
Extracts the database from a CURIE
646+
647+
:param db_id:
648+
:return:
649+
"""
650+
if isinstance(db_id, str) and ':' in db_id:
651+
parts = db_id.split(':')
652+
if len(parts) > 1:
653+
return parts[0]
544654

545655

546656
def is_date(string, fuzzy=False):

tests/resources/bio.obo

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
format-version: 1.2
2+
ontology: bio
3+
4+
[Term]
5+
id: BIO:1
6+
name: biochemical reaction
7+
def: "A biochemical reaction" []
8+
9+
[Term]
10+
id: BIO:2
11+
name: chemical structure
12+
def: "A chemical structure" []
13+

0 commit comments

Comments
 (0)