diff --git a/docs/packages/importers.rst b/docs/packages/importers.rst index 0aa4546..e12f859 100644 --- a/docs/packages/importers.rst +++ b/docs/packages/importers.rst @@ -77,6 +77,13 @@ NCI implements a JSON serialization of ISO-11197. You can import this JSON and c schemauto import-cadsr "cdes/*.json" +Importing from DBML +-------------------- + +DBML (Database Markup Language) is a simple DSL for defining database schemas, with a syntax resembling a subset of SQL DDL. + + + Packages for importing ---------------------- diff --git a/poetry.lock b/poetry.lock index dd714bc..26b1c2b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "airium" @@ -2459,13 +2459,9 @@ files = [ {file = "lxml-5.2.2-cp36-cp36m-win_amd64.whl", hash = "sha256:edcfa83e03370032a489430215c1e7783128808fd3e2e0a3225deee278585196"}, {file = "lxml-5.2.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:28bf95177400066596cdbcfc933312493799382879da504633d16cf60bba735b"}, {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3a745cc98d504d5bd2c19b10c79c61c7c3df9222629f1b6210c0368177589fb8"}, - {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1b590b39ef90c6b22ec0be925b211298e810b4856909c8ca60d27ffbca6c12e6"}, {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b336b0416828022bfd5a2e3083e7f5ba54b96242159f83c7e3eebaec752f1716"}, - {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_28_aarch64.whl", hash = "sha256:c2faf60c583af0d135e853c86ac2735ce178f0e338a3c7f9ae8f622fd2eb788c"}, {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:4bc6cb140a7a0ad1f7bc37e018d0ed690b7b6520ade518285dc3171f7a117905"}, - {file = "lxml-5.2.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash 
= "sha256:7ff762670cada8e05b32bf1e4dc50b140790909caa8303cfddc4d702b71ea184"}, {file = "lxml-5.2.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:57f0a0bbc9868e10ebe874e9f129d2917750adf008fe7b9c1598c0fbbfdde6a6"}, - {file = "lxml-5.2.2-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:a6d2092797b388342c1bc932077ad232f914351932353e2e8706851c870bca1f"}, {file = "lxml-5.2.2-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:60499fe961b21264e17a471ec296dcbf4365fbea611bf9e303ab69db7159ce61"}, {file = "lxml-5.2.2-cp37-cp37m-win32.whl", hash = "sha256:d9b342c76003c6b9336a80efcc766748a333573abf9350f4094ee46b006ec18f"}, {file = "lxml-5.2.2-cp37-cp37m-win_amd64.whl", hash = "sha256:b16db2770517b8799c79aa80f4053cd6f8b716f21f8aca962725a9565ce3ee40"}, @@ -3292,9 +3288,9 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.22.4", markers = "python_version < \"3.11\""}, {version = ">=1.23.2", markers = "python_version == \"3.11\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] python-dateutil = ">=2.8.2" pytz = ">=2020.1" @@ -3792,8 +3788,8 @@ files = [ annotated-types = ">=0.4.0" pydantic-core = "2.20.1" typing-extensions = [ - {version = ">=4.12.2", markers = "python_version >= \"3.13\""}, {version = ">=4.6.1", markers = "python_version < \"3.13\""}, + {version = ">=4.12.2", markers = "python_version >= \"3.13\""}, ] [package.extras] @@ -3900,6 +3896,20 @@ files = [ [package.dependencies] typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" +[[package]] +name = "pydbml" +version = "1.1.2" +description = "Python parser and builder for DBML" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pydbml-1.1.2-py3-none-any.whl", hash = "sha256:3d9e36aa130624169c916bfb40926b453ed10f4a8759808befc8197637df9e98"}, + {file = "pydbml-1.1.2.tar.gz", hash = "sha256:5714b49ce3b3b8d246f9b59c8be384736b05bffc336971047f3d2e0ec0aaca75"}, +] + +[package.dependencies] +pyparsing = 
">=3.0.0" + [[package]] name = "pygments" version = "2.18.0" @@ -5965,4 +5975,4 @@ mariadb = [] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "036cba73b6fd660157c70cb76be27a501017e8904b35c8d2ccb00d412bbba870" +content-hash = "bf523e82bb08caf05eb970b29b6a68e01a536a14ac38257d27756b869d38f4fe" diff --git a/pyproject.toml b/pyproject.toml index 684e019..745879f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,6 +53,7 @@ click-default-group = "^1.2.4" linkml-runtime = "^1.7.2" duckdb = "^0.10.1" numpy = "<2.0" +pydbml = "^1.1.2" [tool.poetry.dev-dependencies] pytest = ">=7.1.1" diff --git a/schema_automator/importers/dbml_import_engine.py b/schema_automator/importers/dbml_import_engine.py new file mode 100644 index 0000000..a40f7d4 --- /dev/null +++ b/schema_automator/importers/dbml_import_engine.py @@ -0,0 +1,95 @@ +from schema_automator.importers.import_engine import ImportEngine +from pydbml import PyDBML +from linkml_runtime.linkml_model import SchemaDefinition, ClassDefinition, SlotDefinition +from dataclasses import dataclass + + +def _map_dbml_type_to_linkml(dbml_type: str) -> str: + """ + Maps DBML data types to LinkML types. + + :param dbml_type: The DBML column type. + :return: Corresponding LinkML type. + """ + type_mapping = { + "int": "integer", + "varchar": "string", + "text": "string", + "float": "float", + "boolean": "boolean", + "date": "date", + "datetime": "datetime", + } + return type_mapping.get(dbml_type.lower(), "string") + + +@dataclass +class DbmlImportEngine(ImportEngine): + """ + An ImportEngine that introspects a DBML schema to determine a corresponding LinkML schema. + """ + + def convert( + self, + file: str, + name: str = None, + model_uri: str = None, + identifier: str = None, + **kwargs + ) -> SchemaDefinition: + """ + Converts a DBML schema file into a LinkML SchemaDefinition. + + :param file: Path to the DBML schema file. + :param name: Optional name for the generated LinkML schema. 
+ :param model_uri: Optional URI for the schema. + :param identifier: Identifier field for the schema. + :return: SchemaDefinition object representing the DBML schema. + """ + # Initialize the schema definition + schema_name = name or "GeneratedSchema" + schema = SchemaDefinition(name=schema_name, id=model_uri or f"https://example.org/{schema_name}") + + # Parse the DBML file + with open(file, 'r', encoding='utf-8') as f: + dbml_content = f.read() + parsed_dbml = PyDBML(dbml_content) + + # Process tables + for table in parsed_dbml.tables: + class_def = ClassDefinition( + name=table.name, + description=table.note or f"Auto-generated class for table '{table.name}'", + slots=[], + unique_keys=[], # Initialize unique keys property + ) + processed_slots = set() # Track processed slot names to avoid duplicates + + # Handle primary key and unique constraints + primary_key_columns = [col for col in table.columns if col.pk] + unique_columns = [col for col in table.columns if col.unique and not col.pk] + + # Process columns + for column in table.columns: + + slot_name = column.name + slot_def = SlotDefinition( + name=slot_name, + range=_map_dbml_type_to_linkml(column.type), + description=column.note or f"Column '{slot_name}'", + required=column in primary_key_columns or column.unique, + identifier=column in primary_key_columns, # Mark primary key columns as identifiers + ) + schema.slots[slot_name] = slot_def + class_def.slots.append(slot_name) + processed_slots.add(slot_name) + + # Handle single unique column as primary key if no explicit primary key exists + if not primary_key_columns and len(unique_columns) == 1: + unique_column = unique_columns[0] + schema.slots[unique_column.name].identifier = True + schema.slots[unique_column.name].required = True + + schema.classes[table.name] = class_def + + return schema diff --git a/tests/test_importers/test_dbml_importer.py b/tests/test_importers/test_dbml_importer.py new file mode 100644 index 0000000..ca3bb70 --- /dev/null +++ 
b/tests/test_importers/test_dbml_importer.py @@ -0,0 +1,72 @@ +import pytest +from linkml_runtime.linkml_model import SchemaDefinition +from schema_automator.importers.dbml_import_engine import DbmlImportEngine + +# Sample DBML content for testing +DBML_SAMPLE = """ +Table Users { + id int [primary key, not null] + email varchar [unique, not null] + username varchar +} + +Table Orders { + order_id int [not null] + user_id int [not null] + product_id int [not null] + quantity int +} + +Table Countries { + code varchar [primary key, not null] + name varchar [not null] +} +""" + +@pytest.fixture +def dbml_file(tmp_path): + """ + Fixture to create a temporary DBML file. + """ + dbml_path = tmp_path / "test.dbml" + dbml_path.write_text(DBML_SAMPLE) + print(dbml_path) + return dbml_path + +@pytest.fixture +def importer(): + """ + Fixture to initialize the DbmlImportEngine. + """ + return DbmlImportEngine() + +def test_dbml_to_linkml_conversion(dbml_file, importer): + """ + Test the basic conversion of DBML to a LinkML schema. + """ + schema = importer.convert(file=str(dbml_file), name="TestSchema") + + # Assert the schema object is created + assert isinstance(schema, SchemaDefinition) + + # Check that expected classes are present + assert "Users" in schema.classes + assert "Orders" in schema.classes + + # Check that expected slots are present + assert "id" in schema.slots + assert schema.slots["id"].identifier + assert schema.slots["id"].required + + +def test_primary_key_handling(dbml_file, importer): + """ + Test correct handling of primary keys and required attributes. + """ + schema = importer.convert(file=str(dbml_file), name="TestSchema") + + # Check that primary keys are marked as required and identifiers + users_class = schema.classes["Users"] + assert "id" in users_class.slots + assert schema.slots["id"].identifier + assert schema.slots["id"].required