Skip to content

Commit 9c36fa4

Browse files
committed
Infer schema metadata from RDFS
1 parent 9a65077 commit 9c36fa4

File tree

2 files changed

+71
-25
lines changed

2 files changed

+71
-25
lines changed

schema_automator/importers/rdfs_import_engine.py

Lines changed: 32 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,17 @@
11
import logging
2-
from typing import Dict, Iterable, List, Any
2+
from pathlib import Path
3+
from typing import Dict, Iterable, List, Any, Mapping, TextIO
34
import typing
4-
from collections import defaultdict
5+
from collections import defaultdict, Counter
56

7+
from jsonasobj2 import JsonObj
68
from linkml.utils.schema_builder import SchemaBuilder
79
from linkml_runtime import SchemaView
810
from linkml_runtime.linkml_model import (
911
SchemaDefinition,
1012
SlotDefinition,
1113
ClassDefinition,
1214
Prefix,
13-
Uriorcurie
1415
)
1516

1617
from dataclasses import dataclass, field
@@ -24,6 +25,8 @@
2425
HTTP_SDO = Namespace("http://schema.org/")
2526

2627
DEFAULT_METAMODEL_MAPPINGS: Dict[str, List[URIRef]] = {
28+
# See https://github.com/linkml/linkml/issues/2507
29+
"description": [RDFS.comment],
2730
"is_a": [RDFS.subClassOf, SKOS.broader],
2831
"domain_of": [HTTP_SDO.domainIncludes, SDO.domainIncludes, RDFS.domain],
2932
"range": [HTTP_SDO.rangeIncludes, SDO.rangeIncludes, RDFS.range],
@@ -55,9 +58,12 @@ class RdfsImportEngine(ImportEngine):
5558
reverse_metamodel_mappings: Dict[URIRef, List[str]] = field(default_factory=lambda: defaultdict(list))
5659
#: The names of LinkML ClassDefinition slots
5760
classdef_slots: set[str] = field(init=False)
58-
#: The names of LinkML SlotDefinition slot slots
61+
#: The names of LinkML SlotDefinition slots
5962
slotdef_slots: set[str] = field(init=False)
63+
#: Every prefix seen in the graph
6064
seen_prefixes: set[str] = field(default_factory=set)
65+
#: The counts of each prefix, used to infer the default prefix
66+
prefix_counts: Counter[str] = field(default_factory=Counter)
6167

6268
def __post_init__(self):
6369
sv = package_schemaview("linkml_runtime.linkml_model.meta")
@@ -91,7 +97,7 @@ def __post_init__(self):
9197

9298
def convert(
9399
self,
94-
file: str,
100+
file: str | Path | TextIO,
95101
name: str | None = None,
96102
format: str | None="turtle",
97103
default_prefix: str | None = None,
@@ -101,23 +107,10 @@ def convert(
101107
) -> SchemaDefinition:
102108
"""
103109
Converts an OWL schema-style ontology
104-
105-
:param file:
106-
:param name:
107-
:param model_uri:
108-
:param identifier:
109-
:param kwargs:
110-
:return:
111110
"""
112111
g = Graph(bind_namespaces="none")
113112
g.parse(file, format=format)
114-
if name is not None and default_prefix is None:
115-
default_prefix = name
116-
if name is None:
117-
name = default_prefix
118-
if name is None:
119-
name = "example"
120-
sb = SchemaBuilder(name=name)
113+
sb = SchemaBuilder()
121114
sb.add_defaults()
122115
schema = sb.schema
123116
for k, v in g.namespaces():
@@ -153,13 +146,29 @@ def convert(
153146
# Remove prefixes that aren't used
154147
if isinstance(schema.imports, list):
155148
for imp in schema.imports:
156-
prefix, suffix = imp.split(":", 1)
149+
prefix, _suffix = imp.split(":", 1)
157150
self.seen_prefixes.add(prefix)
158151
schema.prefixes = {key: value for key, value in schema.prefixes.items() if key in self.seen_prefixes}
159-
152+
self.infer_metadata(schema, name, default_prefix, model_uri)
160153
self.fix_missing(schema)
161154
return schema
162155

156+
def infer_metadata(
    self,
    schema: SchemaDefinition,
    name: str | None,
    default_prefix: str | None = None,
    model_uri: str | None = None,
) -> None:
    """
    Fills in the name, default prefix and id of the schema.

    Explicitly supplied arguments always take precedence. Anything left
    unspecified is inferred from the prefix with the highest count in
    ``self.prefix_counts``.

    :param schema: the schema to update in place
    :param name: explicit schema name; falls back to the inferred prefix
    :param default_prefix: explicit default prefix; falls back to the inferred prefix
    :param model_uri: explicit schema id; falls back to the namespace that the
        inferred prefix expands to, when ``schema.prefixes`` holds a ``Prefix`` for it
    :raises ValueError: if the name or default prefix has to be inferred but
        no prefix has been counted
    """
    top_count = self.prefix_counts.most_common(1)
    inferred_prefix = top_count[0][0] if top_count else None

    resolved_name = name or inferred_prefix
    resolved_prefix = default_prefix or inferred_prefix
    # Only fail when inference was actually needed and had nothing to work with;
    # fully explicit metadata must not depend on the prefix counts.
    if resolved_name is None or resolved_prefix is None:
        raise ValueError("No prefixes found in the graph")
    schema.name = resolved_name
    schema.default_prefix = resolved_prefix

    # An explicit id must be honoured even when the inferred prefix cannot be
    # resolved to a namespace below.
    if model_uri:
        schema.id = model_uri
        return
    if inferred_prefix is None:
        return

    # schema.prefixes may be a plain mapping or a JsonObj; each has its own getter.
    prefix_entry = None
    if isinstance(schema.prefixes, Mapping):
        prefix_entry = schema.prefixes.get(inferred_prefix)
    elif isinstance(schema.prefixes, JsonObj):
        prefix_entry = schema.prefixes._get(inferred_prefix)
    if isinstance(prefix_entry, Prefix):
        schema.id = prefix_entry.prefix_reference
171+
163172
def fix_missing(self, schema: SchemaDefinition) -> None:
164173
"""
165174
For some properties we have a `subproperty_of` that references a slot that doesn't exist.
@@ -181,8 +190,9 @@ def track_uri(self, uri: str, g: Graph) -> None:
181190
"""
182191
Updates the set of prefixes seen in the graph
183192
"""
184-
prefix, namespace, name = g.namespace_manager.compute_qname(uri)
193+
prefix, _namespace, _name = g.namespace_manager.compute_qname(uri)
185194
self.seen_prefixes.add(prefix)
195+
self.prefix_counts.update([prefix])
186196

187197
def process_rdfs_classes(
188198
self,

tests/test_importers/test_rdfs_importer.py

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
"""Test the module can be imported."""
44

5+
from io import StringIO
56
import unittest
67
import os
78
import yaml
@@ -26,9 +27,45 @@ def test_import_foaf():
2627
assert len(sv.all_classes()) == 3
2728
assert len(sv.all_slots()) == 1
2829
assert sv.get_slot("knows").range == "Person"
29-
assert sv.schema.default_prefix == "example"
30-
assert "example" in sv.schema.prefixes
30+
assert sv.schema.default_prefix == "foaf"
31+
assert "foaf" in sv.schema.prefixes
3132

33+
def test_comment_description():
    """
    An rdfs:comment attached to a class should end up as that class's description
    """
    turtle_doc = """
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

<http://example.org/Class> a rdfs:Class ;
rdfs:comment "A class." .
"""
    # The importer is fed an in-memory text stream rather than a file path
    imported_schema = RdfsImportEngine().convert(StringIO(turtle_doc))
    imported_class = SchemaView(imported_schema).get_class("Class")
    assert imported_class.description == "A class."
48+
49+
def test_infer_prefix():
    """
    When no name, id or default prefix is supplied, the importer should derive all three from prefix usage in the schema.
    """
    turtle_doc = """
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix foo: <https://foo.com> .

foo:Class a rdfs:Class ;
rdfs:comment "A class." .

foo:prop a rdfs:Property ;
rdfs:comment "A property." .
"""
    imported_schema = RdfsImportEngine().convert(StringIO(turtle_doc))
    # Nothing was passed explicitly, so every value below must come from the "foo" prefix
    assert imported_schema.default_prefix == "foo"
    assert imported_schema.id == "https://foo.com"
    assert imported_schema.name == "foo"
3269

3370
def test_from_rdfs():
3471
"""Test OWL conversion."""
@@ -37,7 +74,6 @@ def test_from_rdfs():
3774
write_schema(schema, OUTSCHEMA)
3875
# roundtrip
3976
s = YAMLGenerator(OUTSCHEMA).serialize()
40-
print(s[0:100])
4177
sv = SchemaView(OUTSCHEMA)
4278
activity = sv.get_class("Activity")
4379
assert activity

0 commit comments

Comments
 (0)