|
8 | 8 | import os |
9 | 9 | from dataclasses import dataclass |
10 | 10 | from pprint import pprint |
11 | | -from typing import Any, List, Dict, Union |
| 11 | +from typing import Any, List, Dict, Union, Iterator |
12 | 12 |
|
13 | 13 | from linkml_runtime.linkml_model import SchemaDefinition |
14 | | -from linkml_runtime.utils.schemaview import SchemaView |
| 14 | +from linkml_runtime.utils.metamodelcore import Curie |
| 15 | +from linkml_runtime.utils.schemaview import SchemaView, re |
| 16 | +from oaklib import BasicOntologyInterface |
| 17 | +from oaklib.datamodels.search import SearchConfiguration |
| 18 | +from oaklib.datamodels.text_annotator import TextAnnotation |
| 19 | +from oaklib.interfaces import SearchInterface |
| 20 | +from oaklib.interfaces.text_annotator_interface import TextAnnotatorInterface |
15 | 21 |
|
16 | 22 | from schema_automator.utils.schemautils import minify_schema |
17 | 23 |
|
# Root URL of a REST service (data.bioontology.org).
# NOTE(review): no live code in this section references it any more — confirm
# whether it is still needed.
REST_URL = "http://data.bioontology.org"
# Zero-width pattern: matches the position before every ASCII uppercase letter
# except at the very start of the string. Used by uncamel() to split CamelCase.
camel_case_pattern = re.compile(r'(?<!^)(?=[A-Z])')
19 | 26 |
|
20 | | -ANNOTATION = Dict[str, Any] |
21 | | - |
22 | | -@dataclass |
23 | | -class Term: |
24 | | - id: str |
25 | | - prefLabel: str |
26 | | - synonyms: List[str] = None |
27 | | - definition: str = None |
28 | | - semanticType: str = None |
29 | | - cui: str = None |
30 | | - |
31 | | -@dataclass |
32 | | -class Annotation: |
33 | | - start_position: int |
34 | | - end_position: int |
35 | | - matchType: str |
36 | | - text: str |
37 | | - source: str |
38 | | - |
39 | | - def complete(self) -> bool: |
40 | | - return len(self.source) == (self.end_position - self.start_position) + 1 |
41 | | - |
42 | | -@dataclass |
43 | | -class Result: |
44 | | - annotatedClass: Term |
45 | | - annotations: List[Annotation] = None |
46 | | - mappings: List = None |
47 | | - |
48 | | - def complete(self) -> bool: |
49 | | - return any(a for a in self.annotations if a.complete()) |
50 | | - |
51 | | -@dataclass |
52 | | -class ResultSet: |
53 | | - results: List[Result] = None |
def uncamel(n: str):
    """
    Convert a CamelCase or snake_case name into lowercase, space-separated words.

    A space is inserted before every uppercase letter that is not the first
    character, the result is lowercased, and underscores become spaces,
    e.g. ``NamedThing`` -> ``named thing``.

    :param n: name to expand
    :return: the expanded, lowercased text
    """
    spaced = camel_case_pattern.sub(' ', n)
    lowered = spaced.lower()
    return lowered.replace('_', ' ')
54 | 29 |
|
@dataclass
class SchemaAnnotator:
    """
    Annotates the elements of a schema with ontology term mappings.

    All lookups are delegated to ``ontology_implementation``; text annotation
    is used when the implementation supports it, otherwise search.
    """
    # Ontology backend used for lookups. annotate_text() only works when this
    # is a TextAnnotatorInterface or a SearchInterface.
    ontology_implementation: BasicOntologyInterface

    def annotate_text(self, text: str) -> Iterator[TextAnnotation]:
        """
        Yield ontology annotations for a piece of text.

        This is a wrapper over OAK annotation and search; it
        (1) expands CamelCase and (2) abstracts over annotation vs search.
        The text is queried as given, and then again in its expanded form
        when expansion changed more than letter case.

        TODO: fold this functionality back into OAK

        :param text: text to look up, typically an element name
        :return: iterator over annotations; the same term may be yielded
            more than once if both query forms match it
        :raises NotImplementedError: if the ontology implementation supports
            neither text annotation nor search (raised on first iteration,
            because this method is a generator)
        """
        oi = self.ontology_implementation
        text_exp = uncamel(text)
        # Always query the text as given; add the expanded form only when it
        # differs by more than case.
        queries = [text]
        if text_exp != text.lower():
            queries.append(text_exp)
        if isinstance(oi, TextAnnotatorInterface):
            # TextAnnotation is available; use this by default
            for query in queries:
                yield from oi.annotate_text(query)
        elif isinstance(oi, SearchInterface):
            # use search as an alternative
            # NOTE(review): is_complete=True presumably restricts search to
            # whole-term matches, which is why every hit is flagged
            # matches_whole_text below - confirm against OAK
            cfg = SearchConfiguration(is_complete=True)
            for query in queries:
                for curie in oi.basic_search(query, config=cfg):
                    yield TextAnnotation(object_id=curie, matches_whole_text=True)
        else:
            raise NotImplementedError(
                f'{type(oi).__name__} supports neither text annotation nor search')

    def _whole_text_matches(self, text: str, curie_only: bool) -> Iterator[str]:
        """
        Yield the object ids of annotations that match the whole of ``text``.

        :param text: text to look up
        :param curie_only: if true, skip ids that ``Curie.is_curie`` rejects
        :return: iterator over matching ids
        """
        for r in self.annotate_text(text):
            logging.debug(f'MATCH: {r}')
            if not r.matches_whole_text:
                continue
            xref = r.object_id
            if curie_only and not Curie.is_curie(xref):
                continue
            yield xref

    def annotate_schema(self, schema: Union[SchemaDefinition, str], curie_only=True) -> SchemaDefinition:
        """
        Annotate all elements of a schema, adding mappings

        Every element is looked up by its name and each of its aliases;
        whole-text matches are added to the element's exact_mappings.
        Every permissible value of every enum is looked up by its text; the
        first match becomes its meaning (if none is set) and any further
        distinct matches are added to its exact_mappings.

        :param schema: schema object, or a string accepted by SchemaView
        :param curie_only: if true, only accept matches whose id is a CURIE
        :return: the schema held by the SchemaView built from ``schema``
        """
        sv = SchemaView(schema)
        for elt_name, elt in sv.all_elements().items():
            for n in [elt.name] + elt.aliases:
                for xref in self._whole_text_matches(n, curie_only):
                    logging.info(f'Mapping from {elt_name} "{n}" to {xref}')
                    if xref not in elt.exact_mappings:
                        elt.exact_mappings.append(xref)
        for enum_name, e in sv.all_enums().items():
            for pv in e.permissible_values.values():
                for xref in self._whole_text_matches(pv.text, curie_only):
                    logging.info(f'Mapping from {enum_name} "{pv.text}" to {xref}')
                    if pv.meaning is None:
                        logging.info(f'Arbitrarily choosing first match: {xref}')
                        pv.meaning = xref
                    elif xref != pv.meaning and xref not in pv.exact_mappings:
                        # a repeat of the chosen meaning is not an extra mapping
                        pv.exact_mappings.append(xref)

        return sv.schema
119 | 92 |
|
120 | 93 |
|
121 | 94 | @click.command() |
122 | 95 | @click.argument('schema') |
| 96 | +@click.option('--input', '-i', help="OAK input ontology selector") |
123 | 97 | @click.option('--output', '-o', help="Path to saved yaml schema") |
124 | | -def annotate_schema(schema: str, output: str, **args): |
| 98 | +def annotate_schema(schema: str, input: str, output: str, **args): |
125 | 99 | """ |
126 | 100 | Annotate all elements of a schema |
127 | 101 | """ |
|
0 commit comments