
Commit da64950

Merge pull request #161 from linkml/fix_dates_160
Fix date range without times
2 parents 4d81e26 + 7173507 commit da64950


8 files changed (+655, -28 lines)

.github/workflows/check-pull-request.yaml

Lines changed: 6 additions & 2 deletions
@@ -16,14 +16,18 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [ ubuntu-latest, windows-latest ]
+        # os: [ ubuntu-latest, windows-latest ] - Exclude windows for now
+        os: [ ubuntu-latest ]
         python-version: [ "3.9", "3.10" ]
         exclude:
           - os: windows-latest
             python-version: "3.9"

     runs-on: ${{ matrix.os }}

+    # Allow Python 3.13 to fail due to scipy not being available
+    continue-on-error: ${{ matrix.python-version == '3.13' }}
+
     steps:

       #----------------------------------------------
@@ -57,7 +61,7 @@ jobs:
       #----------------------------------------------
       - name: Load cached venv
         id: cached-poetry-dependencies
-        uses: actions/cache@v2
+        uses: actions/cache@v3
         with:
           path: .venv
           key: venv-${{ runner.os }}-${{ hashFiles('**/poetry.lock') }}

.gitignore

Lines changed: 2 additions & 1 deletion
@@ -14,4 +14,5 @@ tests/outputs/*
 venv/
 .venv/
 target/
-local/
+local/
+.python-version

poetry.lock

Lines changed: 570 additions & 9 deletions
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -60,7 +60,7 @@ pydbml = "^1.1.2"
 pyyaml = "^6.0.2"
 llm = {version = "^0.21", optional = true}

-[tool.poetry.dev-dependencies]
+[tool.poetry.group.dev.dependencies]
 pytest = ">=7.1.1"
 Sphinx = ">=4.4.0"
 sphinx-pdj-theme = ">=0.2.1"

schema_automator/generalizers/csv_data_generalizer.py

Lines changed: 24 additions & 1 deletion
@@ -1,3 +1,4 @@
+import datetime
 import click
 import logging
 import yaml
@@ -644,7 +645,11 @@ def infer_range(slot: dict, vals: set, types: dict, coerce=True) -> str:
         return 'boolean'
     if all(isfloat(v) for v in nn_vals):
         return 'float'
-    if all(is_date(v) for v in nn_vals):
+    parsed_datetimes = [is_date_or_datetime(v) for v in nn_vals]
+    if all(pd == 'date' for pd in parsed_datetimes):
+        return 'date'
+    if all(pd in ('date', 'datetime') for pd in parsed_datetimes):
+        # This selects datetime when values are mixed which may fail validation
         return 'datetime'
     if is_all_measurement(nn_vals):
         return 'measurement'
@@ -691,6 +696,24 @@ def is_date(string, fuzzy=False):
         return False


+def is_date_or_datetime(string, fuzzy=False):
+    """
+    Return whether the string can be interpreted as a date or datetime.
+
+    :param string: str, string to check for date
+    :param fuzzy: bool, ignore unknown tokens in string if True
+    """
+    try:
+        dt = parse(string, fuzzy=fuzzy)
+        if dt.hour == 0 and dt.minute == 0 and dt.second == 0:
+            return 'date'
+        return 'datetime'
+    except Exception:
+        # https://stackoverflow.com/questions/4990718/how-can-i-write-a-try-except-block-that-catches-all-exceptions
+        # we don't know all the different parse exceptions, we assume any error means this is not a date
+        return False
+
+
 @dataclass
 class Hit:
     term_id: str
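
For reference, a minimal standalone sketch of the rule the new `is_date_or_datetime` helper encodes: `dateutil.parser.parse` accepts both dates and datetimes, so the helper distinguishes them by whether the parsed time component is midnight. The `classify` function below is illustrative only and is not part of the module:

```python
from dateutil.parser import parse

def classify(value: str):
    """Illustrative re-implementation of the date/datetime check in this commit."""
    try:
        dt = parse(value)
    except Exception:
        return False  # not parseable as a date at all
    # A zero time component is treated as a plain date; anything else is a datetime.
    if dt.hour == 0 and dt.minute == 0 and dt.second == 0:
        return "date"
    return "datetime"

print(classify("2024-01-01"))            # 'date'
print(classify("2024-01-01T12:30:00"))   # 'datetime'
print(classify("not-a-date"))            # False
```

Note that under this rule a genuine timestamp falling exactly at midnight (e.g. "2024-01-01T00:00:00") is classified as a plain date.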

schema_automator/importers/rdfs_import_engine.py

Lines changed: 44 additions & 11 deletions
@@ -1,8 +1,9 @@
 import logging
 from pathlib import Path
-from typing import Dict, Iterable, List, Any, Mapping, TextIO
+from typing import Any, Dict, Iterable, List, Mapping, Optional, TextIO, Union
 import typing
 from collections import defaultdict, Counter
+import warnings

 from jsonasobj2 import JsonObj
 from linkml.utils.schema_builder import SchemaBuilder
@@ -51,7 +52,7 @@ class RdfsImportEngine(ImportEngine):
     #: Mapping from field names in this RDF schema (e.g. `price`) to IRIs (e.g. `http://schema.org/price`)
     mappings: Dict[str, URIRef] = field(default_factory=dict)
     #: User-defined mapping from LinkML metamodel slots (such as `domain_of`) to RDFS IRIs (such as http://schema.org/domainIncludes)
-    initial_metamodel_mappings: Dict[str, URIRef | List[URIRef]] = field(default_factory=dict)
+    initial_metamodel_mappings: Dict[str, Union[URIRef, List[URIRef]]] = field(default_factory=dict)
     #: Combined mapping from LinkML metamodel slots to RDFS IRIs
     metamodel_mappings: Dict[str, List[URIRef]] = field(default_factory=lambda: defaultdict(list))
     #: Reverse of `metamodel_mappings`, but supports multiple terms mapping to the same IRI
@@ -97,12 +98,12 @@ def __post_init__(self):

     def convert(
         self,
-        file: str | Path | TextIO,
-        name: str | None = None,
-        format: str | None="turtle",
-        default_prefix: str | None = None,
-        model_uri: str | None = None,
-        identifier: str | None = None,
+        file: Union[str, Path, TextIO],
+        name: Optional[str] = None,
+        format: Optional[str] = "turtle",
+        default_prefix: Optional[str] = None,
+        model_uri: Optional[str] = None,
+        identifier: Optional[str] = None,
         **kwargs: Any,
     ) -> SchemaDefinition:
         """
@@ -130,7 +131,10 @@ def convert(
         cls_slots = defaultdict(list)

         for slot in self.generate_rdfs_properties(g, cls_slots):
-            sb.add_slot(slot)
+            if slot.name in sb.schema.slots:
+                logging.warning(f"Slot '{slot.name}' already exists in schema; skipping duplicate.")
+            else:
+                sb.add_slot(slot)
         for cls in self.process_rdfs_classes(g, cls_slots):
             sb.add_class(cls)

@@ -151,9 +155,16 @@ def convert(
         schema.prefixes = {key: value for key, value in schema.prefixes.items() if key in self.seen_prefixes}
         self.infer_metadata(schema, name, default_prefix, model_uri)
         self.fix_missing(schema)
+        self._normalize_slot_ranges(schema)
         return schema

-    def infer_metadata(self, schema: SchemaDefinition, name: str | None, default_prefix: str | None = None, model_uri: str | None = None):
+    def infer_metadata(
+        self,
+        schema: SchemaDefinition,
+        name: Optional[str] = None,
+        default_prefix: Optional[str] = None,
+        model_uri: Optional[str] = None,
+    ):
         top_count = self.prefix_counts.most_common(1)
         if len(top_count) == 0:
             raise ValueError("No prefixes found in the graph")
@@ -313,7 +324,7 @@ def _dict_for_subject(self, g: Graph, s: URIRef, subject_type: typing.Literal["s
     def _rdfs_metamodel_iri(self, name: str) -> List[URIRef]:
         return self.metamodel_mappings.get(name, [])

-    def _element_from_iri(self, iri: URIRef) -> str | None:
+    def _element_from_iri(self, iri: URIRef) -> Optional[str]:
         r = self.reverse_metamodel_mappings.get(iri, [])
         if len(r) > 0:
             if len(r) > 1:
@@ -341,3 +352,25 @@ def _as_name(self, v: URIRef) -> str:
         if sep in v_str:
             return v_str.split(sep)[-1]
         return v_str
+
+    def _normalize_slot_ranges(self, schema: SchemaDefinition) -> None:
+        """
+        Normalize slot ranges to valid LinkML scalars where needed.
+        Currently supports remapping RDF types like 'langString'.
+        """
+        RDF_DATATYPE_MAP = {
+            "langString": "string",
+            "Text": "string",
+            "Thing": "string",
+            "landingPage": "string",
+            "Boolean": "boolean",
+            "Number": "integer",
+            "URL": "uri",
+        }
+
+        for slot in schema.slots.values():
+            if slot.range in RDF_DATATYPE_MAP:
+                warnings.warn(
+                    f"Slot '{slot.name}' has unsupported range '{slot.range}'; mapping to '{RDF_DATATYPE_MAP[slot.range]}'."
+                )
+                slot.range = RDF_DATATYPE_MAP[slot.range]
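
Two of the changes above are compatibility and robustness fixes: the `Union`/`Optional` rewrites replace PEP 604 `X | Y` annotations, which are not supported at runtime before Python 3.10 (the CI matrix still includes 3.9), and slots already present in the schema are now skipped with a warning instead of being re-added. The new `_normalize_slot_ranges` method coerces RDF/schema.org-style ranges to LinkML scalars; a minimal sketch of that idea, using a plain dict of slot name to range as a hypothetical stand-in for a real `SchemaDefinition`:

```python
import warnings

# Hypothetical slot table: slot name -> range inferred from the RDF source.
slot_ranges = {"headline": "Text", "url": "URL", "dateCreated": "Date"}

# Same mapping the importer uses to coerce RDF/schema.org datatypes to LinkML scalars.
RDF_DATATYPE_MAP = {
    "langString": "string",
    "Text": "string",
    "Thing": "string",
    "landingPage": "string",
    "Boolean": "boolean",
    "Number": "integer",
    "URL": "uri",
}

for slot_name, rng in slot_ranges.items():
    if rng in RDF_DATATYPE_MAP:
        warnings.warn(f"Slot '{slot_name}' has unsupported range '{rng}'; mapping to '{RDF_DATATYPE_MAP[rng]}'.")
        slot_ranges[slot_name] = RDF_DATATYPE_MAP[rng]

print(slot_ranges)  # {'headline': 'string', 'url': 'uri', 'dateCreated': 'Date'}
```

Ranges not listed in the map (such as 'Date' here) are left untouched.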

tests/test_generalizers/test_csv_data_generalizer.py

Lines changed: 4 additions & 0 deletions
@@ -68,6 +68,10 @@ def test_infer_range(self):
             (['5.999', '7.955', '7.990', '6.990'], "float"),
             (["2mm", "3m", "4 mm"], "measurement"),
             (["true", "false"], "boolean"),
+            (["2024-01-01", "2023-12-31"], "date"),
+            (["2024-01-01T12:30:00", "2023-12-31T08:15:00"], "datetime"),
+            (["2024-01-01", "2023-12-31T08:15:00"], "datetime"),
+            (["2024-01-01", "not-a-date"], "string"),
         ]
         for values, expected in cases:
             self.assertEqual(infer_range({}, values, {}), expected, f"Failed on {values}")
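
A quick way to exercise the updated inference outside the test suite, assuming a local install of schema-automator and the usual import path for the generalizer module; the expected outputs mirror the new cases above:

```python
from schema_automator.generalizers.csv_data_generalizer import infer_range

# Homogeneous dates stay 'date'; any time component, or a mix of dates and
# datetimes, promotes the column to 'datetime'; unparseable values fall back to 'string'.
print(infer_range({}, ["2024-01-01", "2023-12-31"], {}))           # 'date'
print(infer_range({}, ["2024-01-01", "2023-12-31T08:15:00"], {}))  # 'datetime'
print(infer_range({}, ["2024-01-01", "not-a-date"], {}))           # 'string'
```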

tests/test_importers/test_rdfs_importer.py

Lines changed: 4 additions & 3 deletions
@@ -5,6 +5,7 @@
 from io import StringIO
 import unittest
 import os
+import pytest
 import yaml
 from linkml_runtime import SchemaView

@@ -80,6 +81,6 @@ def test_from_rdfs():
     assert activity.name == "Activity"
     assert activity.is_a == "CreativeWork"
     slots = sv.class_induced_slots(activity.name)
-    assert len(slots) == 1
-    slot = slots[0]
-    assert slot.name == "id"
+    assert len(slots) == 18
+    slot_names = [s.name for s in slots]
+    assert "messages" in slot_names
