diff --git a/docs/vector_store.ipynb b/docs/vector_store.ipynb index 8b4778c6..737a62f2 100644 --- a/docs/vector_store.ipynb +++ b/docs/vector_store.ipynb @@ -530,7 +530,7 @@ "metadata": {}, "outputs": [], "source": [ - "from langchain_google_alloydb_pg.indexes import IVFFlatIndex\n", + "from langchain_postgres.v2.indexes import IVFFlatIndex\n", "\n", "index = IVFFlatIndex()\n", "await store.aapply_vector_index(index)" diff --git a/pyproject.toml b/pyproject.toml index 48ea31c0..c3666e56 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,11 +11,8 @@ authors = [ dependencies = [ "google-cloud-alloydb-connector[asyncpg]>=1.2.0, <2.0.0", "google-cloud-storage>=2.18.2, <4.0.0", - "langchain-core>=0.2.36, <1.0.0", - "numpy>=1.24.4, <3.0.0; python_version > '3.9'", - "numpy>=1.24.4, <=2.0.2; python_version <= '3.9'", - "pgvector>=0.2.5, <1.0.0", - "SQLAlchemy[asyncio]>=2.0.25, <3.0.0" + "SQLAlchemy[asyncio]>=2.0.25, <3.0.0", + "langchain-postgres>=0.0.14rc1" ] classifiers = [ diff --git a/requirements.txt b/requirements.txt index 59b1498b..3e7e0a9c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,5 @@ google-cloud-alloydb-connector[asyncpg]==1.7.0 google-cloud-storage==3.1.0 -langchain-core==0.3.41 -numpy==2.2.3; python_version > "3.9" -numpy== 2.0.2; python_version <= "3.9" -pgvector==0.3.6 SQLAlchemy[asyncio]==2.0.38 -langgraph-checkpoint==2.0.19 \ No newline at end of file +langgraph-checkpoint==2.0.19 +langchain-postgres==0.0.14rc1 \ No newline at end of file diff --git a/samples/index_tuning_sample/index_search.py b/samples/index_tuning_sample/index_search.py index 12f24a6f..d1f0bc8a 100644 --- a/samples/index_tuning_sample/index_search.py +++ b/samples/index_tuning_sample/index_search.py @@ -32,12 +32,14 @@ vector_table_name, ) from langchain_google_vertexai import VertexAIEmbeddings - -from langchain_google_alloydb_pg import AlloyDBEngine, AlloyDBVectorStore -from langchain_google_alloydb_pg.indexes import ( +from langchain_postgres.v2.indexes 
import ( HNSWIndex, HNSWQueryOptions, IVFFlatIndex, +) + +from langchain_google_alloydb_pg import AlloyDBEngine, AlloyDBVectorStore +from langchain_google_alloydb_pg.indexes import ( IVFIndex, ScaNNIndex, ) diff --git a/samples/index_tuning_sample/requirements.txt b/samples/index_tuning_sample/requirements.txt index 81e53259..d1810bae 100644 --- a/samples/index_tuning_sample/requirements.txt +++ b/samples/index_tuning_sample/requirements.txt @@ -1,3 +1,4 @@ langchain-google-alloydb-pg==0.9.3 langchain==0.3.20 -langchain-google-vertexai==2.0.14 \ No newline at end of file +langchain-google-vertexai==2.0.14 +langchain-postgres==0.0.14rc1 \ No newline at end of file diff --git a/samples/requirements.txt b/samples/requirements.txt index 550946e3..27a0d571 100644 --- a/samples/requirements.txt +++ b/samples/requirements.txt @@ -3,3 +3,4 @@ google-cloud-resource-manager==1.14.1 langchain-community==0.3.19 langchain-google-alloydb-pg==0.9.3 langchain-google-vertexai==2.0.14 +langchain-postgres==0.0.14rc1 diff --git a/src/langchain_google_alloydb_pg/async_vectorstore.py b/src/langchain_google_alloydb_pg/async_vectorstore.py index eb77030e..73b421e7 100644 --- a/src/langchain_google_alloydb_pg/async_vectorstore.py +++ b/src/langchain_google_alloydb_pg/async_vectorstore.py @@ -16,362 +16,22 @@ from __future__ import annotations import base64 -import copy -import json import re -import uuid -from typing import Any, Callable, Iterable, Optional, Sequence +from typing import Any, Optional, Sequence -import numpy as np import requests from google.cloud import storage # type: ignore from langchain_core.documents import Document from langchain_core.embeddings import Embeddings -from langchain_core.vectorstores import VectorStore, utils -from sqlalchemy import RowMapping, text -from sqlalchemy.ext.asyncio import AsyncEngine +from langchain_postgres.v2.async_vectorstore import AsyncPGVectorStore +from sqlalchemy import RowMapping, text -from .engine import AlloyDBEngine -from
.indexes import ( - DEFAULT_DISTANCE_STRATEGY, - DEFAULT_INDEX_NAME_SUFFIX, - BaseIndex, - DistanceStrategy, - ExactNearestNeighbor, - QueryOptions, - ScaNNIndex, -) -COMPARISONS_TO_NATIVE = { - "$eq": "=", - "$ne": "!=", - "$lt": "<", - "$lte": "<=", - "$gt": ">", - "$gte": ">=", -} - -SPECIAL_CASED_OPERATORS = { - "$in", - "$nin", - "$between", - "$exists", -} - -TEXT_OPERATORS = { - "$like", - "$ilike", -} - -LOGICAL_OPERATORS = {"$and", "$or", "$not"} - -SUPPORTED_OPERATORS = ( - set(COMPARISONS_TO_NATIVE) - .union(TEXT_OPERATORS) - .union(LOGICAL_OPERATORS) - .union(SPECIAL_CASED_OPERATORS) -) - - -class AsyncAlloyDBVectorStore(VectorStore): +class AsyncAlloyDBVectorStore(AsyncPGVectorStore): """Google AlloyDB Vector Store class""" - __create_key = object() - - def __init__( - self, - key: object, - engine: AsyncEngine, - embedding_service: Embeddings, - table_name: str, - schema_name: str = "public", - content_column: str = "content", - embedding_column: str = "embedding", - metadata_columns: list[str] = [], - id_column: str = "langchain_id", - metadata_json_column: Optional[str] = "langchain_metadata", - distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, - k: int = 4, - fetch_k: int = 20, - lambda_mult: float = 0.5, - index_query_options: Optional[QueryOptions] = None, - ): - """AsyncAlloyDBVectorStore constructor. - Args: - key (object): Prevent direct constructor usage. - engine (AlloyDBEngine): Connection pool engine for managing connections to AlloyDB database. - embedding_service (Embeddings): Text embedding model to use. - table_name (str): Name of the existing table or the table to be created. - schema_name (str, optional): Name of the database schema. Defaults to "public". - content_column (str): Column that represent a Document’s page_content. Defaults to "content". - embedding_column (str): Column for embedding vectors. The embedding is generated from the document value. Defaults to "embedding". 
- metadata_columns (list[str]): Column(s) that represent a document's metadata. - id_column (str): Column that represents the Document's id. Defaults to "langchain_id". - metadata_json_column (str): Column to store metadata as JSON. Defaults to "langchain_metadata". - distance_strategy (DistanceStrategy): Distance strategy to use for vector similarity search. Defaults to COSINE_DISTANCE. - k (int): Number of Documents to return from search. Defaults to 4. - fetch_k (int): Number of Documents to fetch to pass to MMR algorithm. - lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5. - index_query_options (QueryOptions): Index query option. - - - Raises: - Exception: If called directly by user. - """ - if key != AsyncAlloyDBVectorStore.__create_key: - raise Exception( - "Only create class through 'create' or 'create_sync' methods!" - ) - - self.engine = engine - self.embedding_service = embedding_service - self.table_name = table_name - self.schema_name = schema_name - self.content_column = content_column - self.embedding_column = embedding_column - self.metadata_columns = metadata_columns - self.id_column = id_column - self.metadata_json_column = metadata_json_column - self.distance_strategy = distance_strategy - self.k = k - self.fetch_k = fetch_k - self.lambda_mult = lambda_mult - self.index_query_options = index_query_options - - @classmethod - async def create( - cls: type[AsyncAlloyDBVectorStore], - engine: AlloyDBEngine, - embedding_service: Embeddings, - table_name: str, - schema_name: str = "public", - content_column: str = "content", - embedding_column: str = "embedding", - metadata_columns: list[str] = [], - ignore_metadata_columns: Optional[list[str]] = None, - id_column: str = "langchain_id", - metadata_json_column: Optional[str] = "langchain_metadata", - distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, - 
k: int = 4, - fetch_k: int = 20, - lambda_mult: float = 0.5, - index_query_options: Optional[QueryOptions] = None, - ) -> AsyncAlloyDBVectorStore: - """Create an AsyncAlloyDBVectorStore instance. - - Args: - engine (AlloyDBEngine): Connection pool engine for managing connections to AlloyDB database. - embedding_service (Embeddings): Text embedding model to use. - table_name (str): Name of an existing table. - schema_name (str, optional): Name of the database schema. Defaults to "public". - content_column (str): Column that represent a Document’s page_content. Defaults to "content". - embedding_column (str): Column for embedding vectors. The embedding is generated from the document value. Defaults to "embedding". - metadata_columns (list[str]): Column(s) that represent a document's metadata. - ignore_metadata_columns (list[str]): Column(s) to ignore in pre-existing tables for a document's metadata. Can not be used with metadata_columns. Defaults to None. - id_column (str): Column that represents the Document's id. Defaults to "langchain_id". - metadata_json_column (str): Column to store metadata as JSON. Defaults to "langchain_metadata". - distance_strategy (DistanceStrategy): Distance strategy to use for vector similarity search. Defaults to COSINE_DISTANCE. - k (int): Number of Documents to return from search. Defaults to 4. - fetch_k (int): Number of Documents to fetch to pass to MMR algorithm. - lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5. - index_query_options (QueryOptions): Index query option. - - Returns: - AsyncAlloyDBVectorStore - """ - if metadata_columns and ignore_metadata_columns: - raise ValueError( - "Can not use both metadata_columns and ignore_metadata_columns." 
- ) - # Get field type information - stmt = f"SELECT column_name, data_type FROM information_schema.columns WHERE table_name = '{table_name}' AND table_schema = '{schema_name}'" - async with engine._pool.connect() as conn: - result = await conn.execute(text(stmt)) - result_map = result.mappings() - results = result_map.fetchall() - columns = {} - for field in results: - columns[field["column_name"]] = field["data_type"] - - # Check columns - if id_column not in columns: - raise ValueError(f"Id column, {id_column}, does not exist.") - if content_column not in columns: - raise ValueError(f"Content column, {content_column}, does not exist.") - content_type = columns[content_column] - if content_type != "text" and "char" not in content_type: - raise ValueError( - f"Content column, {content_column}, is type, {content_type}. It must be a type of character string." - ) - if embedding_column not in columns: - raise ValueError(f"Embedding column, {embedding_column}, does not exist.") - if columns[embedding_column] != "USER-DEFINED": - raise ValueError( - f"Embedding column, {embedding_column}, is not type Vector." 
- ) - - metadata_json_column = ( - None if metadata_json_column not in columns else metadata_json_column - ) - - # If using metadata_columns check to make sure column exists - for column in metadata_columns: - if column not in columns: - raise ValueError(f"Metadata column, {column}, does not exist.") - - # If using ignore_metadata_columns, filter out known columns and set known metadata columns - all_columns = columns - if ignore_metadata_columns: - for column in ignore_metadata_columns: - del all_columns[column] - - del all_columns[id_column] - del all_columns[content_column] - del all_columns[embedding_column] - metadata_columns = [k for k in all_columns.keys()] - - return cls( - cls.__create_key, - engine._pool, - embedding_service, - table_name, - schema_name=schema_name, - content_column=content_column, - embedding_column=embedding_column, - metadata_columns=metadata_columns, - id_column=id_column, - metadata_json_column=metadata_json_column, - distance_strategy=distance_strategy, - k=k, - fetch_k=fetch_k, - lambda_mult=lambda_mult, - index_query_options=index_query_options, - ) - - @property - def embeddings(self) -> Embeddings: - return self.embedding_service - - async def aadd_embeddings( - self, - texts: Iterable[str], - embeddings: list[list[float]], - metadatas: Optional[list[dict]] = None, - ids: Optional[list] = None, - **kwargs: Any, - ) -> list[str]: - """Add data along with embeddings to the table. - - Raises: - :class:`InvalidTextRepresentationError `: if the `ids` data type does not match that of the `id_column`. 
- """ - if not ids: - ids = [str(uuid.uuid4()) for _ in texts] - else: - # This is done to fill in any missing ids - ids = [id if id is not None else str(uuid.uuid4()) for id in ids] - if not metadatas: - metadatas = [{} for _ in texts] - # Insert embeddings - for id, content, embedding, metadata in zip(ids, texts, embeddings, metadatas): - metadata_col_names = ( - ", " + ", ".join(f'"{col}"' for col in self.metadata_columns) - if len(self.metadata_columns) > 0 - else "" - ) - insert_stmt = f'INSERT INTO "{self.schema_name}"."{self.table_name}"("{self.id_column}", "{self.content_column}", "{self.embedding_column}"{metadata_col_names}' - values = { - "id": id, - "content": content, - "embedding": str([float(dimension) for dimension in embedding]), - } - values_stmt = "VALUES (:id, :content, :embedding" - inline_embed_func = getattr( - self.embedding_service, "embed_query_inline", None - ) - if not embedding and callable(inline_embed_func): - values_stmt = f"VALUES (:id, :content, {self.embedding_service.embed_query_inline(content)}" # type: ignore - - # Add metadata - extra = copy.deepcopy(metadata) - for metadata_column in self.metadata_columns: - if metadata_column in metadata: - values_stmt += f", :{metadata_column}" - values[metadata_column] = metadata[metadata_column] - del extra[metadata_column] - else: - values_stmt += ",null" - - # Add JSON column and/or close statement - insert_stmt += ( - f""", "{self.metadata_json_column}")""" - if self.metadata_json_column - else ")" - ) - if self.metadata_json_column: - values_stmt += ", :extra)" - values["extra"] = json.dumps(extra) - else: - values_stmt += ")" - - upsert_stmt = f' ON CONFLICT ("{self.id_column}") DO UPDATE SET "{self.content_column}" = EXCLUDED."{self.content_column}", "{self.embedding_column}" = EXCLUDED."{self.embedding_column}"' - - if self.metadata_json_column: - upsert_stmt += f', "{self.metadata_json_column}" = EXCLUDED."{self.metadata_json_column}"' - - for column in self.metadata_columns: - 
upsert_stmt += f', "{column}" = EXCLUDED."{column}"' - - upsert_stmt += ";" - - query = insert_stmt + values_stmt + upsert_stmt - async with self.engine.connect() as conn: - await conn.execute(text(query), values) - await conn.commit() - - return ids - - async def aadd_texts( - self, - texts: Iterable[str], - metadatas: Optional[list[dict]] = None, - ids: Optional[list] = None, - **kwargs: Any, - ) -> list[str]: - """Embed texts and add to the table. - - Raises: - :class:`InvalidTextRepresentationError `: if the `ids` data type does not match that of the `id_column`. - """ - # Check for inline embedding query - inline_embed_func = getattr(self.embedding_service, "embed_query_inline", None) - if callable(inline_embed_func): - embeddings: list[list[float]] = [[] for _ in list(texts)] - else: - embeddings = await self.embedding_service.aembed_documents(list(texts)) - - ids = await self.aadd_embeddings( - texts, embeddings, metadatas=metadatas, ids=ids, **kwargs - ) - return ids - - async def aadd_documents( - self, - documents: list[Document], - ids: Optional[list] = None, - **kwargs: Any, - ) -> list[str]: - """Embed documents and add to the table. - - Raises: - :class:`InvalidTextRepresentationError `: if the `ids` data type does not match that of the `id_column`. - """ - texts = [doc.page_content for doc in documents] - metadatas = [doc.metadata for doc in documents] - if not ids: - ids = [doc.id for doc in documents] - ids = await self.aadd_texts(texts, metadatas=metadatas, ids=ids, **kwargs) - return ids + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) def _encode_image(self, uri: str) -> str: """Get base64 string from a image URI.""" @@ -423,235 +83,6 @@ async def aadd_images( ) return ids - async def adelete( - self, - ids: Optional[list] = None, - **kwargs: Any, - ) -> Optional[bool]: - """Delete records from the table. 
- - Raises: - :class:`InvalidTextRepresentationError `: if the `ids` data type does not match that of the `id_column`. - """ - if not ids: - return False - - id_list = ", ".join([f"'{id}'" for id in ids]) - query = f'DELETE FROM "{self.schema_name}"."{self.table_name}" WHERE {self.id_column} in ({id_list})' - async with self.engine.connect() as conn: - await conn.execute(text(query)) - await conn.commit() - return True - - @classmethod - async def afrom_texts( # type: ignore[override] - cls: type[AsyncAlloyDBVectorStore], - texts: list[str], - embedding: Embeddings, - engine: AlloyDBEngine, - table_name: str, - schema_name: str = "public", - metadatas: Optional[list[dict]] = None, - ids: Optional[list] = None, - content_column: str = "content", - embedding_column: str = "embedding", - metadata_columns: list[str] = [], - ignore_metadata_columns: Optional[list[str]] = None, - id_column: str = "langchain_id", - metadata_json_column: str = "langchain_metadata", - distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, - k: int = 4, - fetch_k: int = 20, - lambda_mult: float = 0.5, - index_query_options: Optional[QueryOptions] = None, - **kwargs: Any, - ) -> AsyncAlloyDBVectorStore: - """Create an AsyncAlloyDBVectorStore instance from texts. - - Args: - texts (list[str]): Texts to add to the vector store. - embedding (Embeddings): Text embedding model to use. - engine (AlloyDBEngine): Connection pool engine for managing connections to AlloyDB database. - table_name (str): Name of an existing table. - metadatas (Optional[list[dict]]): List of metadatas to add to table records. - ids: (Optional[list[str]]): List of IDs to add to table records. - content_column (str): Column that represent a Document’s page_content. Defaults to "content". - embedding_column (str): Column for embedding vectors. The embedding is generated from the document value. Defaults to "embedding". - metadata_columns (list[str]): Column(s) that represent a document's metadata. 
- ignore_metadata_columns (list[str]): Column(s) to ignore in pre-existing tables for a document's metadata. Can not be used with metadata_columns. Defaults to None. - id_column (str): Column that represents the Document's id. Defaults to "langchain_id". - metadata_json_column (str): Column to store metadata as JSON. Defaults to "langchain_metadata". - distance_strategy (DistanceStrategy): Distance strategy to use for vector similarity search. Defaults to COSINE_DISTANCE. - k (int): Number of Documents to return from search. Defaults to 4. - fetch_k (int): Number of Documents to fetch to pass to MMR algorithm. - lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5. - index_query_options (QueryOptions): Index query option. - - Raises: - :class:`InvalidTextRepresentationError `: if the `ids` data type does not match that of the `id_column`. - - Returns: - AsyncAlloyDBVectorStore - """ - vs = await cls.create( - engine, - embedding, - table_name, - schema_name=schema_name, - content_column=content_column, - embedding_column=embedding_column, - metadata_columns=metadata_columns, - ignore_metadata_columns=ignore_metadata_columns, - id_column=id_column, - metadata_json_column=metadata_json_column, - distance_strategy=distance_strategy, - k=k, - fetch_k=fetch_k, - lambda_mult=lambda_mult, - index_query_options=index_query_options, - ) - await vs.aadd_texts(texts, metadatas=metadatas, ids=ids, **kwargs) - return vs - - @classmethod - async def afrom_documents( # type: ignore[override] - cls: type[AsyncAlloyDBVectorStore], - documents: list[Document], - embedding: Embeddings, - engine: AlloyDBEngine, - table_name: str, - schema_name: str = "public", - ids: Optional[list] = None, - content_column: str = "content", - embedding_column: str = "embedding", - metadata_columns: list[str] = [], - ignore_metadata_columns: Optional[list[str]] = None, 
- id_column: str = "langchain_id", - metadata_json_column: str = "langchain_metadata", - distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, - k: int = 4, - fetch_k: int = 20, - lambda_mult: float = 0.5, - index_query_options: Optional[QueryOptions] = None, - **kwargs: Any, - ) -> AsyncAlloyDBVectorStore: - """Create an AsyncAlloyDBVectorStore instance from documents. - - Args: - documents (list[Document]): Documents to add to the vector store. - embedding (Embeddings): Text embedding model to use. - engine (AlloyDBEngine): Connection pool engine for managing connections to AlloyDB database. - table_name (str): Name of an existing table. - metadatas (Optional[list[dict]]): List of metadatas to add to table records. - ids: (Optional[list[str]]): List of IDs to add to table records. - content_column (str): Column that represent a Document’s page_content. Defaults to "content". - embedding_column (str): Column for embedding vectors. The embedding is generated from the document value. Defaults to "embedding". - metadata_columns (list[str]): Column(s) that represent a document's metadata. - ignore_metadata_columns (list[str]): Column(s) to ignore in pre-existing tables for a document's metadata. Can not be used with metadata_columns. Defaults to None. - id_column (str): Column that represents the Document's id. Defaults to "langchain_id". - metadata_json_column (str): Column to store metadata as JSON. Defaults to "langchain_metadata". - distance_strategy (DistanceStrategy): Distance strategy to use for vector similarity search. Defaults to COSINE_DISTANCE. - k (int): Number of Documents to return from search. Defaults to 4. - fetch_k (int): Number of Documents to fetch to pass to MMR algorithm. - lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5. - index_query_options (QueryOptions): Index query option. 
- - Raises: - :class:`InvalidTextRepresentationError `: if the `ids` data type does not match that of the `id_column`. - - Returns: - AsyncAlloyDBVectorStore - """ - - vs = await cls.create( - engine, - embedding, - table_name, - schema_name=schema_name, - content_column=content_column, - embedding_column=embedding_column, - metadata_columns=metadata_columns, - ignore_metadata_columns=ignore_metadata_columns, - id_column=id_column, - metadata_json_column=metadata_json_column, - distance_strategy=distance_strategy, - k=k, - fetch_k=fetch_k, - lambda_mult=lambda_mult, - index_query_options=index_query_options, - ) - texts = [doc.page_content for doc in documents] - metadatas = [doc.metadata for doc in documents] - await vs.aadd_texts(texts, metadatas=metadatas, ids=ids, **kwargs) - return vs - - async def __query_collection( - self, - embedding: list[float], - k: Optional[int] = None, - filter: Optional[dict] | Optional[str] = None, - **kwargs: Any, - ) -> Sequence[RowMapping]: - """Perform similarity search query on database.""" - k = k if k else self.k - operator = self.distance_strategy.operator - search_function = self.distance_strategy.search_function - - columns = self.metadata_columns + [ - self.id_column, - self.content_column, - self.embedding_column, - ] - if self.metadata_json_column: - columns.append(self.metadata_json_column) - - column_names = ", ".join(f'"{col}"' for col in columns) - - if filter and isinstance(filter, dict): - filter = self._create_filter_clause(filter) - filter = f"WHERE {filter}" if filter else "" - inline_embed_func = getattr(self.embedding_service, "embed_query_inline", None) - if not embedding and callable(inline_embed_func) and "query" in kwargs: - query_embedding = self.embedding_service.embed_query_inline(kwargs["query"]) # type: ignore - else: - query_embedding = f"'{[float(dimension) for dimension in embedding]}'" - stmt = f'SELECT {column_names}, {search_function}({self.embedding_column}, {query_embedding}) as distance FROM 
"{self.schema_name}"."{self.table_name}" {filter} ORDER BY {self.embedding_column} {operator} {query_embedding} LIMIT {k};' - if self.index_query_options: - async with self.engine.connect() as conn: - # Set each query option individually - for query_option in self.index_query_options.to_parameter(): - query_options_stmt = f"SET LOCAL {query_option};" - await conn.execute(text(query_options_stmt)) - result = await conn.execute(text(stmt)) - result_map = result.mappings() - results = result_map.fetchall() - else: - async with self.engine.connect() as conn: - result = await conn.execute(text(stmt)) - result_map = result.mappings() - results = result_map.fetchall() - return results - - async def asimilarity_search( - self, - query: str, - k: Optional[int] = None, - filter: Optional[dict] | Optional[str] = None, - **kwargs: Any, - ) -> list[Document]: - """Return docs selected by similarity search on query.""" - inline_embed_func = getattr(self.embedding_service, "embed_query_inline", None) - embedding = ( - [] - if callable(inline_embed_func) - else await self.embedding_service.aembed_query(text=query) - ) - kwargs["query"] = query - - return await self.asimilarity_search_by_vector( - embedding=embedding, k=k, filter=filter, **kwargs - ) - def _images_embedding_helper(self, image_uris: list[str]) -> list[list[float]]: # check if either `embed_images()` or `embed_image()` API is supported by the embedding service used if hasattr(self.embedding_service, "embed_images"): @@ -678,7 +109,7 @@ async def asimilarity_search_image( self, image_uri: str, k: Optional[int] = None, - filter: Optional[dict] | Optional[str] = None, + filter: Optional[dict] = None, **kwargs: Any, ) -> list[Document]: """Return docs selected by similarity search on query.""" @@ -688,177 +119,6 @@ async def asimilarity_search_image( embedding=embedding, k=k, filter=filter, **kwargs ) - def _select_relevance_score_fn(self) -> Callable[[float], float]: - """Select a relevance function based on distance 
strategy.""" - # Calculate distance strategy provided in - # vectorstore constructor - if self.distance_strategy == DistanceStrategy.COSINE_DISTANCE: - return self._cosine_relevance_score_fn - if self.distance_strategy == DistanceStrategy.INNER_PRODUCT: - return self._max_inner_product_relevance_score_fn - elif self.distance_strategy == DistanceStrategy.EUCLIDEAN: - return self._euclidean_relevance_score_fn - - async def asimilarity_search_with_score( - self, - query: str, - k: Optional[int] = None, - filter: Optional[dict] | Optional[str] = None, - **kwargs: Any, - ) -> list[tuple[Document, float]]: - """Return docs and distance scores selected by similarity search on query.""" - inline_embed_func = getattr(self.embedding_service, "embed_query_inline", None) - embedding = ( - [] - if callable(inline_embed_func) - else await self.embedding_service.aembed_query(text=query) - ) - kwargs["query"] = query - - docs = await self.asimilarity_search_with_score_by_vector( - embedding=embedding, k=k, filter=filter, **kwargs - ) - return docs - - async def asimilarity_search_by_vector( - self, - embedding: list[float], - k: Optional[int] = None, - filter: Optional[dict] | Optional[str] = None, - **kwargs: Any, - ) -> list[Document]: - """Return docs selected by vector similarity search.""" - docs_and_scores = await self.asimilarity_search_with_score_by_vector( - embedding=embedding, k=k, filter=filter, **kwargs - ) - - return [doc for doc, _ in docs_and_scores] - - async def asimilarity_search_with_score_by_vector( - self, - embedding: list[float], - k: Optional[int] = None, - filter: Optional[dict] | Optional[str] = None, - **kwargs: Any, - ) -> list[tuple[Document, float]]: - """Return docs and distance scores selected by vector similarity search.""" - results = await self.__query_collection( - embedding=embedding, k=k, filter=filter, **kwargs - ) - - documents_with_scores = [] - for row in results: - metadata = ( - row[self.metadata_json_column] - if 
self.metadata_json_column and row[self.metadata_json_column] - else {} - ) - for col in self.metadata_columns: - metadata[col] = row[col] - documents_with_scores.append( - ( - Document( - page_content=row[self.content_column], - metadata=metadata, - id=str(row[self.id_column]), - ), - row["distance"], - ) - ) - - return documents_with_scores - - async def amax_marginal_relevance_search( - self, - query: str, - k: Optional[int] = None, - fetch_k: Optional[int] = None, - lambda_mult: Optional[float] = None, - filter: Optional[dict] | Optional[str] = None, - **kwargs: Any, - ) -> list[Document]: - """Return docs selected using the maximal marginal relevance.""" - embedding = await self.embedding_service.aembed_query(text=query) - - return await self.amax_marginal_relevance_search_by_vector( - embedding=embedding, - k=k, - fetch_k=fetch_k, - lambda_mult=lambda_mult, - filter=filter, - **kwargs, - ) - - async def amax_marginal_relevance_search_by_vector( - self, - embedding: list[float], - k: Optional[int] = None, - fetch_k: Optional[int] = None, - lambda_mult: Optional[float] = None, - filter: Optional[dict] | Optional[str] = None, - **kwargs: Any, - ) -> list[Document]: - """Return docs selected using the maximal marginal relevance.""" - docs_and_scores = ( - await self.amax_marginal_relevance_search_with_score_by_vector( - embedding, - k=k, - fetch_k=fetch_k, - lambda_mult=lambda_mult, - filter=filter, - **kwargs, - ) - ) - - return [result[0] for result in docs_and_scores] - - async def amax_marginal_relevance_search_with_score_by_vector( - self, - embedding: list[float], - k: Optional[int] = None, - fetch_k: Optional[int] = None, - lambda_mult: Optional[float] = None, - filter: Optional[dict] | Optional[str] = None, - **kwargs: Any, - ) -> list[tuple[Document, float]]: - """Return docs and distance scores selected using the maximal marginal relevance.""" - results = await self.__query_collection( - embedding=embedding, k=fetch_k, filter=filter, **kwargs - ) - - k = 
k if k else self.k - fetch_k = fetch_k if fetch_k else self.fetch_k - lambda_mult = lambda_mult if lambda_mult else self.lambda_mult - embedding_list = [json.loads(row[self.embedding_column]) for row in results] - mmr_selected = utils.maximal_marginal_relevance( - np.array(embedding, dtype=np.float32), - embedding_list, - k=k, - lambda_mult=lambda_mult, - ) - - documents_with_scores = [] - for row in results: - metadata = ( - row[self.metadata_json_column] - if self.metadata_json_column and row[self.metadata_json_column] - else {} - ) - for col in self.metadata_columns: - metadata[col] = row[col] - documents_with_scores.append( - ( - Document( - page_content=row[self.content_column], - metadata=metadata, - id=str(row[self.id_column]), - ), - row["distance"], - ) - ) - - return [r for i, r in enumerate(documents_with_scores) if i in mmr_selected] - async def set_maintenance_work_mem(self, num_leaves: int, vector_size: int) -> None: """Set database maintenance work memory (for ScaNN index creation).""" # Required index memory in MB @@ -871,339 +131,6 @@ async def set_maintenance_work_mem(self, num_leaves: int, vector_size: int) -> N await conn.execute(text(query)) await conn.commit() - async def aapply_vector_index( - self, - index: BaseIndex, - name: Optional[str] = None, - concurrently: bool = False, - ) -> None: - """Create index in the vector store table.""" - if isinstance(index, ExactNearestNeighbor): - await self.adrop_vector_index() - return - - # if extension name is mentioned, create the extension - if index.extension_name: - async with self.engine.connect() as conn: - await conn.execute( - text(f"CREATE EXTENSION IF NOT EXISTS {index.extension_name}") - ) - await conn.commit() - function = index.get_index_function() - - filter = f"WHERE ({index.partial_indexes})" if index.partial_indexes else "" - params = "WITH " + index.index_options() - if name is None: - if index.name == None: - index.name = self.table_name + DEFAULT_INDEX_NAME_SUFFIX - name = 
index.name - stmt = f"CREATE INDEX {'CONCURRENTLY' if concurrently else ''} {name} ON \"{self.schema_name}\".\"{self.table_name}\" USING {index.index_type} ({self.embedding_column} {function}) {params} {filter};" - if concurrently: - async with self.engine.connect() as conn: - await conn.execute(text("COMMIT")) - await conn.execute(text(stmt)) - else: - async with self.engine.connect() as conn: - await conn.execute(text(stmt)) - await conn.commit() - - async def areindex(self, index_name: Optional[str] = None) -> None: - """Re-index the vector store table.""" - index_name = index_name or self.table_name + DEFAULT_INDEX_NAME_SUFFIX - query = f"REINDEX INDEX {index_name};" - async with self.engine.connect() as conn: - await conn.execute(text(query)) - await conn.commit() - - async def adrop_vector_index( - self, - index_name: Optional[str] = None, - ) -> None: - """Drop the vector index.""" - index_name = index_name or self.table_name + DEFAULT_INDEX_NAME_SUFFIX - query = f"DROP INDEX IF EXISTS {index_name};" - async with self.engine.connect() as conn: - await conn.execute(text(query)) - await conn.commit() - - async def is_valid_index( - self, - index_name: Optional[str] = None, - ) -> bool: - """Check if index exists in the table.""" - index_name = index_name or self.table_name + DEFAULT_INDEX_NAME_SUFFIX - query = f""" - SELECT tablename, indexname - FROM pg_indexes - WHERE tablename = '{self.table_name}' AND schemaname = '{self.schema_name}' AND indexname = '{index_name}'; - """ - async with self.engine.connect() as conn: - result = await conn.execute(text(query)) - result_map = result.mappings() - results = result_map.fetchall() - return bool(len(results) == 1) - - async def aget_by_ids(self, ids: Sequence[str]) -> list[Document]: - """Get documents by ids.""" - - quoted_ids = [f"'{id_val}'" for id_val in ids] - id_list_str = ", ".join(quoted_ids) - - columns = self.metadata_columns + [ - self.id_column, - self.content_column, - ] - if self.metadata_json_column: 
- columns.append(self.metadata_json_column) - - column_names = ", ".join(f'"{col}"' for col in columns) - - query = f'SELECT {column_names} FROM "{self.schema_name}"."{self.table_name}" WHERE "{self.id_column}" IN ({id_list_str});' - - async with self.engine.connect() as conn: - result = await conn.execute(text(query)) - result_map = result.mappings() - results = result_map.fetchall() - - documents = [] - for row in results: - metadata = ( - row[self.metadata_json_column] - if self.metadata_json_column and row[self.metadata_json_column] - else {} - ) - for col in self.metadata_columns: - metadata[col] = row[col] - documents.append( - ( - Document( - page_content=row[self.content_column], - metadata=metadata, - id=str(row[self.id_column]), - ) - ) - ) - - return documents - - def _handle_field_filter( - self, - field: str, - value: Any, - ) -> str: - """Create a filter for a specific field. - - Args: - field: name of field - value: value to filter - If provided as is then this will be an equality filter - If provided as a dictionary then this will be a filter, the key - will be the operator and the value will be the value to filter by - - Returns: - sql where query as a string - """ - if not isinstance(field, str): - raise ValueError( - f"field should be a string but got: {type(field)} with value: {field}" - ) - - if field.startswith("$"): - raise ValueError( - f"Invalid filter condition. Expected a field but got an operator: " - f"{field}" - ) - - # Allow [a-zA-Z0-9_], disallow $ for now until we support escape characters - if not field.isidentifier(): - raise ValueError( - f"Invalid field name: {field}. Expected a valid identifier." - ) - - if isinstance(value, dict): - # This is a filter specification - if len(value) != 1: - raise ValueError( - "Invalid filter condition. Expected a value which " - "is a dictionary with a single key that corresponds to an operator " - f"but got a dictionary with {len(value)} keys. 
The first few " - f"keys are: {list(value.keys())[:3]}" - ) - operator, filter_value = list(value.items())[0] - # Verify that that operator is an operator - if operator not in SUPPORTED_OPERATORS: - raise ValueError( - f"Invalid operator: {operator}. " - f"Expected one of {SUPPORTED_OPERATORS}" - ) - else: # Then we assume an equality operator - operator = "$eq" - filter_value = value - - if operator in COMPARISONS_TO_NATIVE: - # Then we implement an equality filter - # native is trusted input - if isinstance(filter_value, str): - filter_value = f"'{filter_value}'" - native = COMPARISONS_TO_NATIVE[operator] - return f"({field} {native} {filter_value})" - elif operator == "$between": - # Use AND with two comparisons - low, high = filter_value - - return f"({field} BETWEEN {low} AND {high})" - elif operator in {"$in", "$nin", "$like", "$ilike"}: - # We'll do force coercion to text - if operator in {"$in", "$nin"}: - for val in filter_value: - if not isinstance(val, (str, int, float)): - raise NotImplementedError( - f"Unsupported type: {type(val)} for value: {val}" - ) - - if isinstance(val, bool): # b/c bool is an instance of int - raise NotImplementedError( - f"Unsupported type: {type(val)} for value: {val}" - ) - - if operator in {"$in"}: - values = str(tuple(val for val in filter_value)) - return f"({field} IN {values})" - elif operator in {"$nin"}: - values = str(tuple(val for val in filter_value)) - return f"({field} NOT IN {values})" - elif operator in {"$like"}: - return f"({field} LIKE '{filter_value}')" - elif operator in {"$ilike"}: - return f"({field} ILIKE '{filter_value}')" - else: - raise NotImplementedError() - elif operator == "$exists": - if not isinstance(filter_value, bool): - raise ValueError( - "Expected a boolean value for $exists " - f"operator, but got: {filter_value}" - ) - else: - if filter_value: - return f"({field} IS NOT NULL)" - else: - return f"({field} IS NULL)" - else: - raise NotImplementedError() - - def _create_filter_clause(self, 
filters: Any) -> str: - """Create LangChain filter representation to matching SQL where clauses - - Args: - filters: Dictionary of filters to apply to the query. - - Returns: - String containing the sql where query. - """ - - if not isinstance(filters, dict): - raise ValueError( - f"Invalid type: Expected a dictionary but got type: {type(filters)}" - ) - if len(filters) == 1: - # The only operators allowed at the top level are $AND, $OR, and $NOT - # First check if an operator or a field - key, value = list(filters.items())[0] - if key.startswith("$"): - # Then it's an operator - if key.lower() not in ["$and", "$or", "$not"]: - raise ValueError( - f"Invalid filter condition. Expected $and, $or or $not " - f"but got: {key}" - ) - else: - # Then it's a field - return self._handle_field_filter(key, filters[key]) - - if key.lower() == "$and" or key.lower() == "$or": - if not isinstance(value, list): - raise ValueError( - f"Expected a list, but got {type(value)} for value: {value}" - ) - op = key[1:].upper() # Extract the operator - filter_clause = [self._create_filter_clause(el) for el in value] - if len(filter_clause) > 1: - return f"({f' {op} '.join(filter_clause)})" - elif len(filter_clause) == 1: - return filter_clause[0] - else: - raise ValueError( - "Invalid filter condition. Expected a dictionary " - "but got an empty dictionary" - ) - elif key.lower() == "$not": - if isinstance(value, list): - not_conditions = [ - self._create_filter_clause(item) for item in value - ] - not_stmts = [f"NOT {condition}" for condition in not_conditions] - return f"({' AND '.join(not_stmts)})" - elif isinstance(value, dict): - not_ = self._create_filter_clause(value) - return f"(NOT {not_})" - else: - raise ValueError( - f"Invalid filter condition. Expected a dictionary " - f"or a list but got: {type(value)}" - ) - else: - raise ValueError( - f"Invalid filter condition. 
Expected $and, $or or $not " - f"but got: {key}" - ) - elif len(filters) > 1: - # Then all keys have to be fields (they cannot be operators) - for key in filters.keys(): - if key.startswith("$"): - raise ValueError( - f"Invalid filter condition. Expected a field but got: {key}" - ) - # These should all be fields and combined using an $and operator - and_ = [self._handle_field_filter(k, v) for k, v in filters.items()] - if len(and_) > 1: - return f"({' AND '.join(and_)})" - elif len(and_) == 1: - return and_[0] - else: - raise ValueError( - "Invalid filter condition. Expected a dictionary " - "but got an empty dictionary" - ) - else: - return "" - - def get_by_ids(self, ids: Sequence[str]) -> list[Document]: - raise NotImplementedError( - "Sync methods are not implemented for AsyncAlloyDBVectorStore. Use AlloyDBVectorStore interface instead." - ) - - def add_texts( - self, - texts: Iterable[str], - metadatas: Optional[list[dict]] = None, - ids: Optional[list] = None, - **kwargs: Any, - ) -> list[str]: - raise NotImplementedError( - "Sync methods are not implemented for AsyncAlloyDBVectorStore. Use AlloyDBVectorStore interface instead." - ) - - def add_documents( - self, - documents: list[Document], - ids: Optional[list] = None, - **kwargs: Any, - ) -> list[str]: - raise NotImplementedError( - "Sync methods are not implemented for AsyncAlloyDBVectorStore. Use AlloyDBVectorStore interface instead." - ) - def add_images( self, uris: list[str], @@ -1215,146 +142,13 @@ def add_images( "Sync methods are not implemented for AsyncAlloyDBVectorStore. Use AlloyDBVectorStore interface instead." ) - def delete( - self, - ids: Optional[list] = None, - **kwargs: Any, - ) -> Optional[bool]: - raise NotImplementedError( - "Sync methods are not implemented for AsyncAlloyDBVectorStore. Use AlloyDBVectorStore interface instead." 
- ) - - @classmethod - def from_texts( # type: ignore[override] - cls: type[AsyncAlloyDBVectorStore], - texts: list[str], - embedding: Embeddings, - engine: AlloyDBEngine, - table_name: str, - metadatas: Optional[list[dict]] = None, - ids: Optional[list] = None, - content_column: str = "content", - embedding_column: str = "embedding", - metadata_columns: list[str] = [], - ignore_metadata_columns: Optional[list[str]] = None, - id_column: str = "langchain_id", - metadata_json_column: str = "langchain_metadata", - **kwargs: Any, - ) -> AsyncAlloyDBVectorStore: - raise NotImplementedError( - "Sync methods are not implemented for AsyncAlloyDBVectorStore. Use AlloyDBVectorStore interface instead." - ) - - @classmethod - def from_documents( # type: ignore[override] - cls: type[AsyncAlloyDBVectorStore], - documents: list[Document], - embedding: Embeddings, - engine: AlloyDBEngine, - table_name: str, - ids: Optional[list] = None, - content_column: str = "content", - embedding_column: str = "embedding", - metadata_columns: list[str] = [], - ignore_metadata_columns: Optional[list[str]] = None, - id_column: str = "langchain_id", - metadata_json_column: str = "langchain_metadata", - **kwargs: Any, - ) -> AsyncAlloyDBVectorStore: - raise NotImplementedError( - "Sync methods are not implemented for AsyncAlloyDBVectorStore. Use AlloyDBVectorStore interface instead." - ) - - def similarity_search( - self, - query: str, - k: Optional[int] = None, - filter: Optional[dict] | Optional[str] = None, - **kwargs: Any, - ) -> list[Document]: - raise NotImplementedError( - "Sync methods are not implemented for AsyncAlloyDBVectorStore. Use AlloyDBVectorStore interface instead." - ) - def similarity_search_image( self, image_uri: str, k: Optional[int] = None, - filter: Optional[dict] | Optional[str] = None, - **kwargs: Any, - ) -> list[Document]: - raise NotImplementedError( - "Sync methods are not implemented for AsyncAlloyDBVectorStore. Use AlloyDBVectorStore interface instead." 
- ) - - def similarity_search_with_score( - self, - query: str, - k: Optional[int] = None, - filter: Optional[dict] | Optional[str] = None, - **kwargs: Any, - ) -> list[tuple[Document, float]]: - raise NotImplementedError( - "Sync methods are not implemented for AsyncAlloyDBVectorStore. Use AlloyDBVectorStore interface instead." - ) - - def similarity_search_by_vector( - self, - embedding: list[float], - k: Optional[int] = None, - filter: Optional[dict] | Optional[str] = None, - **kwargs: Any, - ) -> list[Document]: - raise NotImplementedError( - "Sync methods are not implemented for AsyncAlloyDBVectorStore. Use AlloyDBVectorStore interface instead." - ) - - def similarity_search_with_score_by_vector( - self, - embedding: list[float], - k: Optional[int] = None, - filter: Optional[dict] | Optional[str] = None, - **kwargs: Any, - ) -> list[tuple[Document, float]]: - raise NotImplementedError( - "Sync methods are not implemented for AsyncAlloyDBVectorStore. Use AlloyDBVectorStore interface instead." - ) - - def max_marginal_relevance_search( - self, - query: str, - k: Optional[int] = None, - fetch_k: Optional[int] = None, - lambda_mult: Optional[float] = None, - filter: Optional[dict] | Optional[str] = None, + filter: Optional[dict] = None, **kwargs: Any, ) -> list[Document]: raise NotImplementedError( "Sync methods are not implemented for AsyncAlloyDBVectorStore. Use AlloyDBVectorStore interface instead." ) - - def max_marginal_relevance_search_by_vector( - self, - embedding: list[float], - k: Optional[int] = None, - fetch_k: Optional[int] = None, - lambda_mult: Optional[float] = None, - filter: Optional[dict] | Optional[str] = None, - **kwargs: Any, - ) -> list[Document]: - raise NotImplementedError( - "Sync methods are not implemented for AsyncAlloyDBVectorStore. Use AlloyDBVectorStore interface instead." 
- ) - - def max_marginal_relevance_search_with_score_by_vector( - self, - embedding: list[float], - k: Optional[int] = None, - fetch_k: Optional[int] = None, - lambda_mult: Optional[float] = None, - filter: Optional[dict] | Optional[str] = None, - **kwargs: Any, - ) -> list[tuple[Document, float]]: - raise NotImplementedError( - "Sync methods are not implemented for AsyncAlloyDBVectorStore. Use AlloyDBVectorStore interface instead." - ) diff --git a/src/langchain_google_alloydb_pg/engine.py b/src/langchain_google_alloydb_pg/engine.py index 48799baf..c2b32954 100644 --- a/src/langchain_google_alloydb_pg/engine.py +++ b/src/langchain_google_alloydb_pg/engine.py @@ -35,10 +35,11 @@ IPTypes, RefreshStrategy, ) -from sqlalchemy import MetaData, RowMapping, Table, text +from langchain_postgres import Column, PGEngine +from sqlalchemy import MetaData, Table, text from sqlalchemy.engine import URL from sqlalchemy.exc import InvalidRequestError -from sqlalchemy.ext.asyncio import AsyncEngine, create_async_engine +from sqlalchemy.ext.asyncio import create_async_engine from .version import __version__ @@ -90,60 +91,10 @@ async def _get_iam_principal_email( return email.replace(".gserviceaccount.com", "") -@dataclass -class Column: - name: str - data_type: str - nullable: bool = True - - def __post_init__(self) -> None: - """Check if initialization parameters are valid. - - Raises: - ValueError: If Column name is not string. - ValueError: If data_type is not type string. 
- """ - - if not isinstance(self.name, str): - raise ValueError("Column name must be type string") - if not isinstance(self.data_type, str): - raise ValueError("Column data_type must be type string") - - -class AlloyDBEngine: +class AlloyDBEngine(PGEngine): """A class for managing connections to a AlloyDB database.""" _connector: Optional[AsyncConnector] = None - _default_loop: Optional[asyncio.AbstractEventLoop] = None - _default_thread: Optional[Thread] = None - __create_key = object() - - def __init__( - self, - key: object, - pool: AsyncEngine, - loop: Optional[asyncio.AbstractEventLoop], - thread: Optional[Thread], - ) -> None: - """AlloyDBEngine constructor. - - Args: - key (object): Prevent direct constructor usage. - engine (AsyncEngine): Async engine connection pool. - loop (Optional[asyncio.AbstractEventLoop]): Async event loop used to create the engine. - thread (Optional[Thread]): Thread used to create the engine async. - - Raises: - Exception: If the constructor is called directly by the user. - """ - - if key != AlloyDBEngine.__create_key: - raise Exception( - "Only create class through 'create' or 'create_sync' methods!" 
- ) - self._pool = pool - self._loop = loop - self._thread = thread @classmethod def __start_background_loop( @@ -317,7 +268,7 @@ async def getconn() -> asyncpg.Connection: async_creator=getconn, **engine_args, ) - return cls(cls.__create_key, engine, loop, thread) + return cls(PGEngine._PGEngine__create_key, engine, loop, thread) # type: ignore @classmethod async def afrom_instance( @@ -367,13 +318,24 @@ async def afrom_instance( return await asyncio.wrap_future(future) @classmethod - def from_engine( - cls: type[AlloyDBEngine], - engine: AsyncEngine, - loop: Optional[asyncio.AbstractEventLoop] = None, + def from_connection_string( + cls, + url: str | URL, + **kwargs: Any, ) -> AlloyDBEngine: - """Create an AlloyDBEngine instance from an AsyncEngine.""" - return cls(cls.__create_key, engine, loop, None) + """Create an AlloyDBEngine instance from arguments + + Args: + url (Optional[str]): the URL used to connect to a database. Use url or set other arguments. + + Raises: + ValueError: If not all database url arguments are specified + + Returns: + AlloyDBEngine + """ + + return AlloyDBEngine.from_engine_args(url=url, **kwargs) @classmethod def from_engine_args( @@ -408,197 +370,7 @@ def from_engine_args( raise ValueError("Driver must be type 'postgresql+asyncpg'") engine = create_async_engine(url, **kwargs) - return cls(cls.__create_key, engine, cls._default_loop, cls._default_thread) - - async def _run_as_async(self, coro: Awaitable[T]) -> T: - """Run an async coroutine asynchronously""" - # If a loop has not been provided, attempt to run in current thread - if not self._loop: - return await coro - # Otherwise, run in the background thread - return await asyncio.wrap_future( - asyncio.run_coroutine_threadsafe(coro, self._loop) - ) - - def _run_as_sync(self, coro: Awaitable[T]) -> T: - """Run an async coroutine synchronously""" - if not self._loop: - raise Exception( - "Engine was initialized without a background loop and cannot call sync methods." 
- ) - return asyncio.run_coroutine_threadsafe(coro, self._loop).result() - - async def close(self) -> None: - """Dispose of connection pool""" - await self._run_as_async(self._pool.dispose()) - - async def _ainit_vectorstore_table( - self, - table_name: str, - vector_size: int, - schema_name: str = "public", - content_column: str = "content", - embedding_column: str = "embedding", - metadata_columns: list[Column] = [], - metadata_json_column: str = "langchain_metadata", - id_column: Union[str, Column] = "langchain_id", - overwrite_existing: bool = False, - store_metadata: bool = True, - ) -> None: - """ - Create a table for saving of vectors to be used with AlloyDBVectorStore. - - Args: - table_name (str): The Postgres database table name. - vector_size (int): Vector size for the embedding model to be used. - schema_name (str): The schema name. - Default: "public". - content_column (str): Name of the column to store document content. - Default: "page_content". - embedding_column (str) : Name of the column to store vector embeddings. - Default: "embedding". - metadata_columns (list[Column]): A list of Columns to create for custom - metadata. Default: []. Optional. - metadata_json_column (str): The column to store extra metadata in JSON format. - Default: "langchain_metadata". Optional. - id_column (Union[str, Column]) : Column to store ids. - Default: "langchain_id" column name with data type UUID. Optional. - overwrite_existing (bool): Whether to drop existing table. Default: False. - store_metadata (bool): Whether to store metadata in the table. - Default: True. - - Raises: - :class:`DuplicateTableError `: if table already exists. - :class:`UndefinedObjectError `: if the data type of the id column is not a postgreSQL data type. 
- """ - async with self._pool.connect() as conn: - await conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector")) - await conn.commit() - - if overwrite_existing: - async with self._pool.connect() as conn: - await conn.execute( - text(f'DROP TABLE IF EXISTS "{schema_name}"."{table_name}"') - ) - await conn.commit() - - id_data_type = "UUID" if isinstance(id_column, str) else id_column.data_type - id_column_name = id_column if isinstance(id_column, str) else id_column.name - - query = f"""CREATE TABLE "{schema_name}"."{table_name}"( - "{id_column_name}" {id_data_type} PRIMARY KEY, - "{content_column}" TEXT NOT NULL, - "{embedding_column}" vector({vector_size}) NOT NULL""" - for column in metadata_columns: - nullable = "NOT NULL" if not column.nullable else "" - query += f',\n"{column.name}" {column.data_type} {nullable}' - if store_metadata: - query += f""",\n"{metadata_json_column}" JSON""" - query += "\n);" - - async with self._pool.connect() as conn: - await conn.execute(text(query)) - await conn.commit() - - async def ainit_vectorstore_table( - self, - table_name: str, - vector_size: int, - schema_name: str = "public", - content_column: str = "content", - embedding_column: str = "embedding", - metadata_columns: list[Column] = [], - metadata_json_column: str = "langchain_metadata", - id_column: Union[str, Column] = "langchain_id", - overwrite_existing: bool = False, - store_metadata: bool = True, - ) -> None: - """ - Create a table for saving of vectors to be used with AlloyDBVectorStore. - - Args: - table_name (str): The database table name. - vector_size (int): Vector size for the embedding model to be used. - schema_name (str): The schema name. - Default: "public". - content_column (str): Name of the column to store document content. - Default: "page_content". - embedding_column (str) : Name of the column to store vector embeddings. - Default: "embedding". - metadata_columns (list[Column]): A list of Columns to create for custom - metadata. Default: []. 
Optional. - metadata_json_column (str): The column to store extra metadata in JSON format. - Default: "langchain_metadata". Optional. - id_column (Union[str, Column]) : Column to store ids. - Default: "langchain_id" column name with data type UUID. Optional. - overwrite_existing (bool): Whether to drop existing table. Default: False. - store_metadata (bool): Whether to store metadata in the table. - Default: True. - """ - await self._run_as_async( - self._ainit_vectorstore_table( - table_name, - vector_size, - schema_name, - content_column, - embedding_column, - metadata_columns, - metadata_json_column, - id_column, - overwrite_existing, - store_metadata, - ) - ) - - def init_vectorstore_table( - self, - table_name: str, - vector_size: int, - schema_name: str = "public", - content_column: str = "content", - embedding_column: str = "embedding", - metadata_columns: list[Column] = [], - metadata_json_column: str = "langchain_metadata", - id_column: Union[str, Column] = "langchain_id", - overwrite_existing: bool = False, - store_metadata: bool = True, - ) -> None: - """ - Create a table for saving of vectors to be used with AlloyDBVectorStore. - - Args: - table_name (str): The database table name. - vector_size (int): Vector size for the embedding model to be used. - schema_name (str): The schema name. - Default: "public". - content_column (str): Name of the column to store document content. - Default: "page_content". - embedding_column (str) : Name of the column to store vector embeddings. - Default: "embedding". - metadata_columns (list[Column]): A list of Columns to create for custom - metadata. Default: []. Optional. - metadata_json_column (str): The column to store extra metadata in JSON format. - Default: "langchain_metadata". Optional. - id_column (Union[str, Column]) : Column to store ids. - Default: "langchain_id" column name with data type UUID. Optional. - overwrite_existing (bool): Whether to drop existing table. Default: False. 
- store_metadata (bool): Whether to store metadata in the table. - Default: True. - """ - self._run_as_sync( - self._ainit_vectorstore_table( - table_name, - vector_size, - schema_name, - content_column, - embedding_column, - metadata_columns, - metadata_json_column, - id_column, - overwrite_existing, - store_metadata, - ) - ) + return cls(PGEngine._PGEngine__create_key, engine, cls._default_loop, cls._default_thread) # type: ignore async def _ainit_chat_history_table( self, table_name: str, schema_name: str = "public" diff --git a/src/langchain_google_alloydb_pg/indexes.py b/src/langchain_google_alloydb_pg/indexes.py index 051de869..add2c75e 100644 --- a/src/langchain_google_alloydb_pg/indexes.py +++ b/src/langchain_google_alloydb_pg/indexes.py @@ -12,124 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import enum import warnings -from abc import ABC, abstractmethod from dataclasses import dataclass, field -from typing import Optional - -@dataclass -class StrategyMixin: - operator: str - search_function: str - index_function: str - - -class DistanceStrategy(StrategyMixin, enum.Enum): - """Enumerator of the Distance strategies.""" - - EUCLIDEAN = "<->", "l2_distance", "vector_l2_ops" - COSINE_DISTANCE = "<=>", "cosine_distance", "vector_cosine_ops" - INNER_PRODUCT = "<#>", "inner_product", "vector_ip_ops" - - -DEFAULT_DISTANCE_STRATEGY: DistanceStrategy = DistanceStrategy.COSINE_DISTANCE -DEFAULT_INDEX_NAME_SUFFIX: str = "langchainvectorindex" - - -@dataclass -class BaseIndex(ABC): - name: Optional[str] = None - index_type: str = "base" - distance_strategy: DistanceStrategy = field( - default_factory=lambda: DistanceStrategy.COSINE_DISTANCE - ) - partial_indexes: Optional[list[str]] = None - extension_name: Optional[str] = None - - @abstractmethod - def index_options(self) -> str: - """Set index query options for vector store initialization.""" - raise NotImplementedError( - "index_options method 
must be implemented by subclass" - ) - - def get_index_function(self) -> str: - return self.distance_strategy.index_function - - -@dataclass -class ExactNearestNeighbor(BaseIndex): - index_type: str = "exactnearestneighbor" - - -@dataclass -class QueryOptions(ABC): - @abstractmethod - def to_parameter(self) -> list[str]: - """Convert index attributes to list of configurations.""" - raise NotImplementedError("to_parameter method must be implemented by subclass") - - @abstractmethod - def to_string(self) -> str: - """Convert index attributes to string.""" - raise NotImplementedError("to_string method must be implemented by subclass") - - -@dataclass -class HNSWIndex(BaseIndex): - index_type: str = "hnsw" - m: int = 16 - ef_construction: int = 64 - - def index_options(self) -> str: - """Set index query options for vector store initialization.""" - return f"(m = {self.m}, ef_construction = {self.ef_construction})" - - -@dataclass -class HNSWQueryOptions(QueryOptions): - ef_search: int = 40 - - def to_parameter(self) -> list[str]: - """Convert index attributes to list of configurations.""" - return [f"hnsw.ef_search = {self.ef_search}"] - - def to_string(self) -> str: - """Convert index attributes to string.""" - warnings.warn( - "to_string is deprecated, use to_parameter instead.", - DeprecationWarning, - ) - return f"hnsw.ef_search = {self.ef_search}" - - -@dataclass -class IVFFlatIndex(BaseIndex): - index_type: str = "ivfflat" - lists: int = 100 - - def index_options(self) -> str: - """Set index query options for vector store initialization.""" - return f"(lists = {self.lists})" - - -@dataclass -class IVFFlatQueryOptions(QueryOptions): - probes: int = 1 - - def to_parameter(self) -> list[str]: - """Convert index attributes to list of configurations.""" - return [f"ivfflat.probes = {self.probes}"] - - def to_string(self) -> str: - """Convert index attributes to string.""" - warnings.warn( - "to_string is deprecated, use to_parameter instead.", - DeprecationWarning, - 
) - return f"ivfflat.probes = {self.probes}" +from langchain_postgres.v2.indexes import BaseIndex, DistanceStrategy, QueryOptions @dataclass diff --git a/src/langchain_google_alloydb_pg/vectorstore.py b/src/langchain_google_alloydb_pg/vectorstore.py index a4d9d0a0..d43c05b7 100644 --- a/src/langchain_google_alloydb_pg/vectorstore.py +++ b/src/langchain_google_alloydb_pg/vectorstore.py @@ -15,56 +15,38 @@ # TODO: Remove below import when minimum supported Python version is 3.10 from __future__ import annotations -from typing import Any, Callable, Iterable, Optional, Sequence +from typing import Any, Optional from langchain_core.documents import Document from langchain_core.embeddings import Embeddings -from langchain_core.vectorstores import VectorStore - -from .async_vectorstore import AsyncAlloyDBVectorStore -from .engine import AlloyDBEngine -from .indexes import ( +from langchain_postgres import PGEngine, PGVectorStore +from langchain_postgres.v2.indexes import ( DEFAULT_DISTANCE_STRATEGY, - BaseIndex, DistanceStrategy, QueryOptions, ) +from .async_vectorstore import AsyncAlloyDBVectorStore +from .engine import AlloyDBEngine + -class AlloyDBVectorStore(VectorStore): +class AlloyDBVectorStore(PGVectorStore): """Google AlloyDB Vector Store class""" __create_key = object() - - def __init__(self, key: object, engine: AlloyDBEngine, vs: AsyncAlloyDBVectorStore): - """AlloyDBVectorStore constructor. - Args: - key (object): Prevent direct constructor usage. - engine (AlloyDBEngine): Connection pool engine for managing connections to Postgres database. - vs (AsyncAlloyDBVectorstore): The async only VectorStore implementation - - - Raises: - Exception: If called directly by user. - """ - if key != AlloyDBVectorStore.__create_key: - raise Exception( - "Only create class through 'create' or 'create_sync' methods!" 
- ) - - self._engine = engine - self.__vs = vs + _engine: AlloyDBEngine + __vs: AsyncAlloyDBVectorStore @classmethod async def create( cls: type[AlloyDBVectorStore], - engine: AlloyDBEngine, + engine: AlloyDBEngine, # type: ignore embedding_service: Embeddings, table_name: str, schema_name: str = "public", content_column: str = "content", embedding_column: str = "embedding", - metadata_columns: list[str] = [], + metadata_columns: Optional[list[str]] = None, ignore_metadata_columns: Optional[list[str]] = None, id_column: str = "langchain_id", metadata_json_column: Optional[str] = "langchain_metadata", @@ -73,15 +55,15 @@ async def create( fetch_k: int = 20, lambda_mult: float = 0.5, index_query_options: Optional[QueryOptions] = None, - ) -> AlloyDBVectorStore: - """Create an AlloyDBVectorStore instance. + ) -> PGVectorStore: + """Create an PGVectorStore instance. Args: - engine (AlloyDBEngine): Connection pool engine for managing connections to AlloyDB database. + engine (AlloyDBEngine): Connection pool engine for managing connections to postgres database. embedding_service (Embeddings): Text embedding model to use. table_name (str): Name of an existing table. schema_name (str, optional): Name of the database schema. Defaults to "public". - content_column (str): Column that represent a Document’s page_content. Defaults to "content". + content_column (str): Column that represent a Document's page_content. Defaults to "content". embedding_column (str): Column for embedding vectors. The embedding is generated from the document value. Defaults to "embedding". metadata_columns (list[str]): Column(s) that represent a document's metadata. ignore_metadata_columns (list[str]): Column(s) to ignore in pre-existing tables for a document's metadata. Can not be used with metadata_columns. Defaults to None. @@ -94,7 +76,7 @@ async def create( index_query_options (QueryOptions): Index query option. 
Returns: - AlloyDBVectorStore + PGVectorStore """ coro = AsyncAlloyDBVectorStore.create( engine, @@ -114,18 +96,18 @@ async def create( index_query_options=index_query_options, ) vs = await engine._run_as_async(coro) - return cls(cls.__create_key, engine, vs) + return cls(cls._PGVectorStore__create_key, engine, vs) # type: ignore @classmethod def create_sync( - cls, - engine: AlloyDBEngine, + cls: type[AlloyDBVectorStore], + engine: AlloyDBEngine, # type: ignore embedding_service: Embeddings, table_name: str, schema_name: str = "public", content_column: str = "content", embedding_column: str = "embedding", - metadata_columns: list[str] = [], + metadata_columns: Optional[list[str]] = None, ignore_metadata_columns: Optional[list[str]] = None, id_column: str = "langchain_id", metadata_json_column: str = "langchain_metadata", @@ -134,18 +116,18 @@ def create_sync( fetch_k: int = 20, lambda_mult: float = 0.5, index_query_options: Optional[QueryOptions] = None, - ) -> AlloyDBVectorStore: - """Create an AlloyDBVectorStore instance. + ) -> PGVectorStore: + """Create an PGVectorStore instance. Args: key (object): Prevent direct constructor usage. - engine (AlloyDBEngine): Connection pool engine for managing connections to AlloyDB database. + engine (AlloyDBEngine): Connection pool engine for managing connections to postgres database. embedding_service (Embeddings): Text embedding model to use. table_name (str): Name of an existing table. schema_name (str, optional): Name of the database schema. Defaults to "public". - content_column (str, optional): Column that represent a Document’s page_content. Defaults to "content". + content_column (str, optional): Column that represent a Document's page_content. Defaults to "content". embedding_column (str, optional): Column for embedding vectors. The embedding is generated from the document value. Defaults to "embedding". - metadata_columns (list[str]): Column(s) that represent a document's metadata. Defaults to an empty list. 
+ metadata_columns (list[str], optional): Column(s) that represent a document's metadata. Defaults to None. ignore_metadata_columns (Optional[list[str]]): Column(s) to ignore in pre-existing tables for a document's metadata. Can not be used with metadata_columns. Defaults to None. id_column (str, optional): Column that represents the Document's id. Defaults to "langchain_id". metadata_json_column (str, optional): Column to store metadata as JSON. Defaults to "langchain_metadata". @@ -156,7 +138,7 @@ def create_sync( index_query_options (Optional[QueryOptions], optional): Index query option. Defaults to None. Returns: - AlloyDBVectorStore + PGVectorStore """ coro = AsyncAlloyDBVectorStore.create( engine, @@ -176,55 +158,7 @@ def create_sync( index_query_options=index_query_options, ) vs = engine._run_as_sync(coro) - return cls(cls.__create_key, engine, vs) - - @property - def embeddings(self) -> Embeddings: - return self.__vs.embedding_service - - async def aadd_embeddings( - self, - texts: Iterable[str], - embeddings: list[list[float]], - metadatas: Optional[list[dict]] = None, - ids: Optional[list[str]] = None, - **kwargs: Any, - ) -> list[str]: - """Add data along with embeddings to the table.""" - return await self._engine._run_as_async( - self.__vs.aadd_embeddings(texts, embeddings, metadatas, ids, **kwargs) - ) - - async def aadd_texts( - self, - texts: Iterable[str], - metadatas: Optional[list[dict]] = None, - ids: Optional[list] = None, - **kwargs: Any, - ) -> list[str]: - """Embed texts and add to the table. - - Raises: - :class:`InvalidTextRepresentationError `: if the `ids` data type does not match that of the `id_column`. - """ - return await self._engine._run_as_async( - self.__vs.aadd_texts(texts, metadatas, ids, **kwargs) - ) - - async def aadd_documents( - self, - documents: list[Document], - ids: Optional[list] = None, - **kwargs: Any, - ) -> list[str]: - """Embed documents and add to the table. 
- - Raises: - :class:`InvalidTextRepresentationError `: if the `ids` data type does not match that of the `id_column`. - """ - return await self._engine._run_as_async( - self.__vs.aadd_documents(documents, ids, **kwargs) - ) + return cls(cls._PGVectorStore__create_key, engine, vs) # type: ignore async def aadd_images( self, @@ -235,51 +169,7 @@ async def aadd_images( ) -> list[str]: """Embed images and add to the table.""" return await self._engine._run_as_async( - self.__vs.aadd_images(uris, metadatas, ids, **kwargs) - ) - - def add_embeddings( - self, - texts: Iterable[str], - embeddings: list[list[float]], - metadatas: Optional[list[dict]] = None, - ids: Optional[list[str]] = None, - **kwargs: Any, - ) -> list[str]: - """Add data along with embeddings to the table.""" - return self._engine._run_as_sync( - self.__vs.aadd_embeddings(texts, embeddings, metadatas, ids, **kwargs) - ) - - def add_texts( - self, - texts: Iterable[str], - metadatas: Optional[list[dict]] = None, - ids: Optional[list] = None, - **kwargs: Any, - ) -> list[str]: - """Embed texts and add to the table. - - Raises: - :class:`InvalidTextRepresentationError `: if the `ids` data type does not match that of the `id_column`. - """ - return self._engine._run_as_sync( - self.__vs.aadd_texts(texts, metadatas, ids, **kwargs) - ) - - def add_documents( - self, - documents: list[Document], - ids: Optional[list] = None, - **kwargs: Any, - ) -> list[str]: - """Embed documents and add to the table. - - Raises: - :class:`InvalidTextRepresentationError `: if the `ids` data type does not match that of the `id_column`. 
- """ - return self._engine._run_as_sync( - self.__vs.aadd_documents(documents, ids, **kwargs) + self._PGVectorStore__vs.aadd_images(uris, metadatas, ids, **kwargs) # type: ignore ) def add_images( @@ -291,545 +181,31 @@ def add_images( ) -> list[str]: """Embed images and add to the table.""" return self._engine._run_as_sync( - self.__vs.aadd_images(uris, metadatas, ids, **kwargs) - ) - - async def adelete( - self, - ids: Optional[list] = None, - **kwargs: Any, - ) -> Optional[bool]: - """Delete records from the table. - - Raises: - :class:`InvalidTextRepresentationError `: if the `ids` data type does not match that of the `id_column`. - """ - return await self._engine._run_as_async(self.__vs.adelete(ids, **kwargs)) - - def delete( - self, - ids: Optional[list] = None, - **kwargs: Any, - ) -> Optional[bool]: - """Delete records from the table. - - Raises: - :class:`InvalidTextRepresentationError `: if the `ids` data type does not match that of the `id_column`. - """ - return self._engine._run_as_sync(self.__vs.adelete(ids, **kwargs)) - - @classmethod - async def afrom_texts( # type: ignore[override] - cls: type[AlloyDBVectorStore], - texts: list[str], - embedding: Embeddings, - engine: AlloyDBEngine, - table_name: str, - schema_name: str = "public", - metadatas: Optional[list[dict]] = None, - ids: Optional[list] = None, - content_column: str = "content", - embedding_column: str = "embedding", - metadata_columns: list[str] = [], - ignore_metadata_columns: Optional[list[str]] = None, - id_column: str = "langchain_id", - metadata_json_column: str = "langchain_metadata", - distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, - k: int = 4, - fetch_k: int = 20, - lambda_mult: float = 0.5, - index_query_options: Optional[QueryOptions] = None, - **kwargs: Any, - ) -> AlloyDBVectorStore: - """Create an AlloyDBVectorStore instance from texts. - - Args: - texts (list[str]): Texts to add to the vector store. - embedding (Embeddings): Text embedding model to use. 
- engine (AlloyDBEngine): Connection pool engine for managing connections to AlloyDB database. - table_name (str): Name of an existing table. - schema_name (str, optional): Name of the database schema. Defaults to "public". - metadatas (Optional[list[dict]], optional): List of metadatas to add to table records. Defaults to None. - ids: (Optional[list]): List of IDs to add to table records. Defaults to None. - content_column (str, optional): Column that represent a Document’s page_content. Defaults to "content". - embedding_column (str, optional): Column for embedding vectors. The embedding is generated from the document value. Defaults to "embedding". - metadata_columns (list[str], optional): Column(s) that represent a document's metadata. Defaults to an empty list. - ignore_metadata_columns (Optional[list[str]], optional): Column(s) to ignore in pre-existing tables for a document's metadata. Can not be used with metadata_columns. Defaults to None. - id_column (str, optional): Column that represents the Document's id. Defaults to "langchain_id". - metadata_json_column (str, optional): Column to store metadata as JSON. Defaults to "langchain_metadata". - distance_strategy (DistanceStrategy): Distance strategy to use for vector similarity search. Defaults to COSINE_DISTANCE. - k (int): Number of Documents to return from search. Defaults to 4. - fetch_k (int): Number of Documents to fetch to pass to MMR algorithm. - lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5. - index_query_options (QueryOptions): Index query option. - - Raises: - :class:`InvalidTextRepresentationError `: if the `ids` data type does not match that of the `id_column`. 
- - Returns: - AlloyDBVectorStore - """ - vs = await cls.create( - engine, - embedding, - table_name, - schema_name=schema_name, - content_column=content_column, - embedding_column=embedding_column, - metadata_columns=metadata_columns, - ignore_metadata_columns=ignore_metadata_columns, - metadata_json_column=metadata_json_column, - id_column=id_column, - distance_strategy=distance_strategy, - k=k, - fetch_k=fetch_k, - lambda_mult=lambda_mult, - index_query_options=index_query_options, - ) - await vs.aadd_texts(texts, metadatas=metadatas, ids=ids) - return vs - - @classmethod - async def afrom_documents( # type: ignore[override] - cls: type[AlloyDBVectorStore], - documents: list[Document], - embedding: Embeddings, - engine: AlloyDBEngine, - table_name: str, - schema_name: str = "public", - ids: Optional[list] = None, - content_column: str = "content", - embedding_column: str = "embedding", - metadata_columns: list[str] = [], - ignore_metadata_columns: Optional[list[str]] = None, - id_column: str = "langchain_id", - metadata_json_column: str = "langchain_metadata", - distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, - k: int = 4, - fetch_k: int = 20, - lambda_mult: float = 0.5, - index_query_options: Optional[QueryOptions] = None, - **kwargs: Any, - ) -> AlloyDBVectorStore: - """Create an AlloyDBVectorStore instance from documents. - - Args: - documents (list[Document]): Documents to add to the vector store. - embedding (Embeddings): Text embedding model to use. - engine (AlloyDBEngine): Connection pool engine for managing connections to AlloyDB database. - table_name (str): Name of an existing table. - schema_name (str, optional): Name of the database schema. Defaults to "public". - ids: (Optional[list]): List of IDs to add to table records. Defaults to None. - content_column (str, optional): Column that represent a Document’s page_content. Defaults to "content". - embedding_column (str, optional): Column for embedding vectors. 
The embedding is generated from the document value. Defaults to "embedding". - metadata_columns (list[str], optional): Column(s) that represent a document's metadata. Defaults to an empty list. - ignore_metadata_columns (Optional[list[str]], optional): Column(s) to ignore in pre-existing tables for a document's metadata. Can not be used with metadata_columns. Defaults to None. - id_column (str, optional): Column that represents the Document's id. Defaults to "langchain_id". - metadata_json_column (str, optional): Column to store metadata as JSON. Defaults to "langchain_metadata". - distance_strategy (DistanceStrategy): Distance strategy to use for vector similarity search. Defaults to COSINE_DISTANCE. - k (int): Number of Documents to return from search. Defaults to 4. - fetch_k (int): Number of Documents to fetch to pass to MMR algorithm. - lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5. - index_query_options (QueryOptions): Index query option. - - Raises: - :class:`InvalidTextRepresentationError `: if the `ids` data type does not match that of the `id_column`. 
- - Returns: - AlloyDBVectorStore - """ - - vs = await cls.create( - engine, - embedding, - table_name, - schema_name=schema_name, - content_column=content_column, - embedding_column=embedding_column, - metadata_columns=metadata_columns, - ignore_metadata_columns=ignore_metadata_columns, - metadata_json_column=metadata_json_column, - id_column=id_column, - distance_strategy=distance_strategy, - k=k, - fetch_k=fetch_k, - lambda_mult=lambda_mult, - index_query_options=index_query_options, - ) - await vs.aadd_documents(documents, ids=ids) - return vs - - @classmethod - def from_texts( # type: ignore[override] - cls: type[AlloyDBVectorStore], - texts: list[str], - embedding: Embeddings, - engine: AlloyDBEngine, - table_name: str, - schema_name: str = "public", - metadatas: Optional[list[dict]] = None, - ids: Optional[list] = None, - content_column: str = "content", - embedding_column: str = "embedding", - metadata_columns: list[str] = [], - ignore_metadata_columns: Optional[list[str]] = None, - id_column: str = "langchain_id", - metadata_json_column: str = "langchain_metadata", - distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, - k: int = 4, - fetch_k: int = 20, - lambda_mult: float = 0.5, - index_query_options: Optional[QueryOptions] = None, - **kwargs: Any, - ) -> AlloyDBVectorStore: - """Create an AlloyDBVectorStore instance from texts. - - Args: - texts (list[str]): Texts to add to the vector store. - embedding (Embeddings): Text embedding model to use. - engine (AlloyDBEngine): Connection pool engine for managing connections to AlloyDB database. - table_name (str): Name of an existing table. - schema_name (str, optional): Name of the database schema. Defaults to "public". - metadatas (Optional[list[dict]], optional): List of metadatas to add to table records. Defaults to None. - ids: (Optional[list]): List of IDs to add to table records. Defaults to None. - content_column (str, optional): Column that represent a Document’s page_content. 
Defaults to "content". - embedding_column (str, optional): Column for embedding vectors. The embedding is generated from the document value. Defaults to "embedding". - metadata_columns (list[str], optional): Column(s) that represent a document's metadata. Defaults to empty list. - ignore_metadata_columns (Optional[list[str]], optional): Column(s) to ignore in pre-existing tables for a document's metadata. Can not be used with metadata_columns. Defaults to None. - id_column (str, optional): Column that represents the Document's id. Defaults to "langchain_id". - metadata_json_column (str, optional): Column to store metadata as JSON. Defaults to "langchain_metadata". - distance_strategy (DistanceStrategy): Distance strategy to use for vector similarity search. Defaults to COSINE_DISTANCE. - k (int): Number of Documents to return from search. Defaults to 4. - fetch_k (int): Number of Documents to fetch to pass to MMR algorithm. - lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5. - index_query_options (QueryOptions): Index query option. - - Raises: - :class:`InvalidTextRepresentationError `: if the `ids` data type does not match that of the `id_column`. 
- - Returns: - AlloyDBVectorStore - """ - vs = cls.create_sync( - engine, - embedding, - table_name, - schema_name=schema_name, - content_column=content_column, - embedding_column=embedding_column, - metadata_columns=metadata_columns, - ignore_metadata_columns=ignore_metadata_columns, - metadata_json_column=metadata_json_column, - id_column=id_column, - distance_strategy=distance_strategy, - k=k, - fetch_k=fetch_k, - lambda_mult=lambda_mult, - index_query_options=index_query_options, - **kwargs, - ) - vs.add_texts(texts, metadatas=metadatas, ids=ids) - return vs - - @classmethod - def from_documents( # type: ignore[override] - cls: type[AlloyDBVectorStore], - documents: list[Document], - embedding: Embeddings, - engine: AlloyDBEngine, - table_name: str, - schema_name: str = "public", - ids: Optional[list] = None, - content_column: str = "content", - embedding_column: str = "embedding", - metadata_columns: list[str] = [], - ignore_metadata_columns: Optional[list[str]] = None, - id_column: str = "langchain_id", - metadata_json_column: str = "langchain_metadata", - distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, - k: int = 4, - fetch_k: int = 20, - lambda_mult: float = 0.5, - index_query_options: Optional[QueryOptions] = None, - **kwargs: Any, - ) -> AlloyDBVectorStore: - """Create an AlloyDBVectorStore instance from documents. - - Args: - documents (list[Document]): Documents to add to the vector store. - embedding (Embeddings): Text embedding model to use. - engine (AlloyDBEngine): Connection pool engine for managing connections to AlloyDB database. - table_name (str): Name of an existing table. - schema_name (str, optional): Name of the database schema. Defaults to "public". - ids: (Optional[list]): List of IDs to add to table records. Defaults to None. - content_column (str, optional): Column that represent a Document’s page_content. Defaults to "content". - embedding_column (str, optional): Column for embedding vectors. 
The embedding is generated from the document value. Defaults to "embedding". - metadata_columns (list[str], optional): Column(s) that represent a document's metadata. Defaults to an empty list. - ignore_metadata_columns (Optional[list[str]], optional): Column(s) to ignore in pre-existing tables for a document's metadata. Can not be used with metadata_columns. Defaults to None. - id_column (str, optional): Column that represents the Document's id. Defaults to "langchain_id". - metadata_json_column (str, optional): Column to store metadata as JSON. Defaults to "langchain_metadata". - distance_strategy (DistanceStrategy): Distance strategy to use for vector similarity search. Defaults to COSINE_DISTANCE. - k (int): Number of Documents to return from search. Defaults to 4. - fetch_k (int): Number of Documents to fetch to pass to MMR algorithm. - lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5. - index_query_options (QueryOptions): Index query option. - - Raises: - :class:`InvalidTextRepresentationError `: if the `ids` data type does not match that of the `id_column`. 
- - Returns: - AlloyDBVectorStore - """ - vs = cls.create_sync( - engine, - embedding, - table_name, - schema_name=schema_name, - content_column=content_column, - embedding_column=embedding_column, - metadata_columns=metadata_columns, - ignore_metadata_columns=ignore_metadata_columns, - metadata_json_column=metadata_json_column, - id_column=id_column, - distance_strategy=distance_strategy, - k=k, - fetch_k=fetch_k, - lambda_mult=lambda_mult, - index_query_options=index_query_options, - **kwargs, - ) - vs.add_documents(documents, ids=ids) - return vs - - def similarity_search( - self, - query: str, - k: Optional[int] = None, - filter: Optional[dict] | Optional[str] = None, - **kwargs: Any, - ) -> list[Document]: - """Return docs selected by similarity search on query.""" - return self._engine._run_as_sync( - self.__vs.asimilarity_search(query, k, filter, **kwargs) + self._PGVectorStore__vs.aadd_images(uris, metadatas, ids, **kwargs) # type: ignore ) def similarity_search_image( self, image_uri: str, k: Optional[int] = None, - filter: Optional[dict] | Optional[str] = None, + filter: Optional[dict] = None, **kwargs: Any, ) -> list[Document]: """Return docs selected by similarity search on image.""" return self._engine._run_as_sync( - self.__vs.asimilarity_search_image(image_uri, k, filter, **kwargs) - ) - - async def asimilarity_search( - self, - query: str, - k: Optional[int] = None, - filter: Optional[dict] | Optional[str] = None, - **kwargs: Any, - ) -> list[Document]: - """Return docs selected by similarity search on query.""" - return await self._engine._run_as_async( - self.__vs.asimilarity_search(query, k, filter, **kwargs) + self._PGVectorStore__vs.asimilarity_search_image(image_uri, k, filter, **kwargs) # type: ignore ) async def asimilarity_search_image( self, image_uri: str, k: Optional[int] = None, - filter: Optional[dict] | Optional[str] = None, + filter: Optional[dict] = None, **kwargs: Any, ) -> list[Document]: """Return docs selected by similarity 
search on image_uri.""" return await self._engine._run_as_async( - self.__vs.asimilarity_search_image(image_uri, k, filter, **kwargs) - ) - - # Required for (a)similarity_search_with_relevance_scores - def _select_relevance_score_fn(self) -> Callable[[float], float]: - """Select a relevance function based on distance strategy.""" - # Calculate distance strategy provided in vectorstore constructor - if self.__vs.distance_strategy == DistanceStrategy.COSINE_DISTANCE: - return self._cosine_relevance_score_fn - if self.__vs.distance_strategy == DistanceStrategy.INNER_PRODUCT: - return self._max_inner_product_relevance_score_fn - elif self.__vs.distance_strategy == DistanceStrategy.EUCLIDEAN: - return self._euclidean_relevance_score_fn - - async def asimilarity_search_with_score( - self, - query: str, - k: Optional[int] = None, - filter: Optional[dict] | Optional[str] = None, - **kwargs: Any, - ) -> list[tuple[Document, float]]: - """Return docs and distance scores selected by similarity search on query.""" - return await self._engine._run_as_async( - self.__vs.asimilarity_search_with_score(query, k, filter, **kwargs) - ) - - async def asimilarity_search_by_vector( - self, - embedding: list[float], - k: Optional[int] = None, - filter: Optional[dict] | Optional[str] = None, - **kwargs: Any, - ) -> list[Document]: - """Return docs selected by vector similarity search.""" - return await self._engine._run_as_async( - self.__vs.asimilarity_search_by_vector(embedding, k, filter, **kwargs) - ) - - async def asimilarity_search_with_score_by_vector( - self, - embedding: list[float], - k: Optional[int] = None, - filter: Optional[dict] | Optional[str] = None, - **kwargs: Any, - ) -> list[tuple[Document, float]]: - """Return docs and distance scores selected by vector similarity search.""" - return await self._engine._run_as_async( - self.__vs.asimilarity_search_with_score_by_vector( - embedding, k, filter, **kwargs - ) - ) - - async def amax_marginal_relevance_search( - self, - 
query: str, - k: Optional[int] = None, - fetch_k: Optional[int] = None, - lambda_mult: Optional[float] = None, - filter: Optional[dict] | Optional[str] = None, - **kwargs: Any, - ) -> list[Document]: - """Return docs selected using the maximal marginal relevance.""" - return await self._engine._run_as_async( - self.__vs.amax_marginal_relevance_search( - query, k, fetch_k, lambda_mult, filter, **kwargs - ) - ) - - async def amax_marginal_relevance_search_by_vector( - self, - embedding: list[float], - k: Optional[int] = None, - fetch_k: Optional[int] = None, - lambda_mult: Optional[float] = None, - filter: Optional[dict] | Optional[str] = None, - **kwargs: Any, - ) -> list[Document]: - """Return docs selected using the maximal marginal relevance.""" - return await self._engine._run_as_async( - self.__vs.amax_marginal_relevance_search_by_vector( - embedding, k, fetch_k, lambda_mult, filter, **kwargs - ) - ) - - async def amax_marginal_relevance_search_with_score_by_vector( - self, - embedding: list[float], - k: Optional[int] = None, - fetch_k: Optional[int] = None, - lambda_mult: Optional[float] = None, - filter: Optional[dict] | Optional[str] = None, - **kwargs: Any, - ) -> list[tuple[Document, float]]: - """Return docs and distance scores selected using the maximal marginal relevance.""" - return await self._engine._run_as_async( - self.__vs.amax_marginal_relevance_search_with_score_by_vector( - embedding, k, fetch_k, lambda_mult, filter, **kwargs - ) - ) - - def similarity_search_with_score( - self, - query: str, - k: Optional[int] = None, - filter: Optional[dict] | Optional[str] = None, - **kwargs: Any, - ) -> list[tuple[Document, float]]: - """Return docs and distance scores selected by similarity search on query.""" - return self._engine._run_as_sync( - self.__vs.asimilarity_search_with_score(query, k, filter, **kwargs) - ) - - def similarity_search_by_vector( - self, - embedding: list[float], - k: Optional[int] = None, - filter: Optional[dict] | Optional[str] = 
None, - **kwargs: Any, - ) -> list[Document]: - """Return docs selected by vector similarity search.""" - return self._engine._run_as_sync( - self.__vs.asimilarity_search_by_vector(embedding, k, filter, **kwargs) - ) - - def similarity_search_with_score_by_vector( - self, - embedding: list[float], - k: Optional[int] = None, - filter: Optional[dict] | Optional[str] = None, - **kwargs: Any, - ) -> list[tuple[Document, float]]: - """Return docs and distance scores selected by similarity search on vector.""" - return self._engine._run_as_sync( - self.__vs.asimilarity_search_with_score_by_vector( - embedding, k, filter, **kwargs - ) - ) - - def max_marginal_relevance_search( - self, - query: str, - k: Optional[int] = None, - fetch_k: Optional[int] = None, - lambda_mult: Optional[float] = None, - filter: Optional[dict] | Optional[str] = None, - **kwargs: Any, - ) -> list[Document]: - """Return docs selected using the maximal marginal relevance.""" - return self._engine._run_as_sync( - self.__vs.amax_marginal_relevance_search( - query, k, fetch_k, lambda_mult, filter, **kwargs - ) - ) - - def max_marginal_relevance_search_by_vector( - self, - embedding: list[float], - k: Optional[int] = None, - fetch_k: Optional[int] = None, - lambda_mult: Optional[float] = None, - filter: Optional[dict] | Optional[str] = None, - **kwargs: Any, - ) -> list[Document]: - """Return docs selected using the maximal marginal relevance.""" - return self._engine._run_as_sync( - self.__vs.amax_marginal_relevance_search_by_vector( - embedding, k, fetch_k, lambda_mult, filter, **kwargs - ) - ) - - def max_marginal_relevance_search_with_score_by_vector( - self, - embedding: list[float], - k: Optional[int] = None, - fetch_k: Optional[int] = None, - lambda_mult: Optional[float] = None, - filter: Optional[dict] | Optional[str] = None, - **kwargs: Any, - ) -> list[tuple[Document, float]]: - """Return docs and distance scores selected using the maximal marginal relevance.""" - return 
self._engine._run_as_sync( - self.__vs.amax_marginal_relevance_search_with_score_by_vector( - embedding, k, fetch_k, lambda_mult, filter, **kwargs - ) + self._PGVectorStore__vs.asimilarity_search_image(image_uri, k, filter, **kwargs) # type: ignore ) async def aset_maintenance_work_mem( @@ -837,82 +213,11 @@ async def aset_maintenance_work_mem( ) -> None: """Set database maintenance work memory (for ScaNN index creation).""" await self._engine._run_as_async( - self.__vs.set_maintenance_work_mem(num_leaves, vector_size) + self._PGVectorStore__vs.set_maintenance_work_mem(num_leaves, vector_size) # type: ignore ) def set_maintenance_work_mem(self, num_leaves: int, vector_size: int) -> None: """Set database maintenance work memory (for ScaNN index creation).""" self._engine._run_as_sync( - self.__vs.set_maintenance_work_mem(num_leaves, vector_size) - ) - - async def aapply_vector_index( - self, - index: BaseIndex, - name: Optional[str] = None, - concurrently: bool = False, - ) -> None: - """Create an index on the vector store table.""" - return await self._engine._run_as_async( - self.__vs.aapply_vector_index(index, name, concurrently) - ) - - def apply_vector_index( - self, - index: BaseIndex, - name: Optional[str] = None, - concurrently: bool = False, - ) -> None: - """Create an index on the vector store table.""" - return self._engine._run_as_sync( - self.__vs.aapply_vector_index(index, name, concurrently) - ) - - async def areindex(self, index_name: Optional[str] = None) -> None: - """Re-index the vector store table.""" - return await self._engine._run_as_async(self.__vs.areindex(index_name)) - - def reindex(self, index_name: Optional[str] = None) -> None: - """Re-index the vector store table.""" - return self._engine._run_as_sync(self.__vs.areindex(index_name)) - - async def adrop_vector_index( - self, - index_name: Optional[str] = None, - ) -> None: - """Drop the vector index.""" - return await self._engine._run_as_async( - 
self.__vs.adrop_vector_index(index_name) + self._PGVectorStore__vs.set_maintenance_work_mem(num_leaves, vector_size) # type: ignore ) - - def drop_vector_index( - self, - index_name: Optional[str] = None, - ) -> None: - """Drop the vector index.""" - return self._engine._run_as_sync(self.__vs.adrop_vector_index(index_name)) - - async def ais_valid_index( - self, - index_name: Optional[str] = None, - ) -> bool: - """Check if index exists in the table.""" - return await self._engine._run_as_async(self.__vs.is_valid_index(index_name)) - - def is_valid_index( - self, - index_name: Optional[str] = None, - ) -> bool: - """Check if index exists in the table.""" - return self._engine._run_as_sync(self.__vs.is_valid_index(index_name)) - - async def aget_by_ids(self, ids: Sequence[str]) -> list[Document]: - """Get documents by ids.""" - return await self._engine._run_as_async(self.__vs.aget_by_ids(ids=ids)) - - def get_by_ids(self, ids: Sequence[str]) -> list[Document]: - """Get documents by ids.""" - return self._engine._run_as_sync(self.__vs.aget_by_ids(ids=ids)) - - def get_table_name(self) -> str: - return self.__vs.table_name diff --git a/tests/metadata_filtering_data.py b/tests/metadata_filtering_data.py deleted file mode 100644 index d983e331..00000000 --- a/tests/metadata_filtering_data.py +++ /dev/null @@ -1,263 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -METADATAS = [ - { - "name": "Wireless Headphones", - "code": "WH001", - "price": 149.99, - "is_available": True, - "release_date": "2023-10-26", - "tags": ["audio", "wireless", "electronics"], - "dimensions": [18.5, 7.2, 21.0], - "inventory_location": [101, 102], - "available_quantity": 50, - }, - { - "name": "Ergonomic Office Chair", - "code": "EC002", - "price": 299.00, - "is_available": True, - "release_date": "2023-08-15", - "tags": ["furniture", "office", "ergonomic"], - "dimensions": [65.0, 60.0, 110.0], - "inventory_location": [201], - "available_quantity": 10, - }, - { - "name": "Stainless Steel Water Bottle", - "code": "WB003", - "price": 25.50, - "is_available": False, - "release_date": "2024-01-05", - "tags": ["hydration", "eco-friendly", "kitchen"], - "dimensions": [7.5, 7.5, 25.0], - "available_quantity": 0, - }, - { - "name": "Smart Fitness Tracker", - "code": "FT004", - "price": 79.95, - "is_available": True, - "release_date": "2023-11-12", - "tags": ["fitness", "wearable", "technology"], - "dimensions": [2.0, 1.0, 25.0], - "inventory_location": [401], - "available_quantity": 100, - }, -] - -FILTERING_TEST_CASES = [ - # These tests only involve equality checks - ( - {"code": "FT004"}, - ["FT004"], - ), - # String field - ( - # check name - {"name": "Smart Fitness Tracker"}, - ["FT004"], - ), - # Boolean fields - ( - {"is_available": True}, - ["WH001", "FT004", "EC002"], - ), - # And semantics for top level filtering - ( - {"code": "WH001", "is_available": True}, - ["WH001"], - ), - # These involve equality checks and other operators - # like $ne, $gt, $gte, $lt, $lte - ( - {"available_quantity": {"$eq": 10}}, - ["EC002"], - ), - ( - {"available_quantity": {"$ne": 0}}, - ["WH001", "FT004", "EC002"], - ), - ( - {"available_quantity": {"$gt": 60}}, - ["FT004"], - ), - ( - {"available_quantity": {"$gte": 50}}, - ["WH001", "FT004"], - ), - ( - {"available_quantity": {"$lt": 5}}, - ["WB003"], - ), - ( - {"available_quantity": {"$lte": 10}}, - ["WB003", 
"EC002"], - ), - # Repeat all the same tests with name (string column) - ( - {"code": {"$eq": "WH001"}}, - ["WH001"], - ), - ( - {"code": {"$ne": "WB003"}}, - ["WH001", "FT004", "EC002"], - ), - # And also gt, gte, lt, lte relying on lexicographical ordering - ( - {"name": {"$gt": "Wireless Headphones"}}, - [], - ), - ( - {"name": {"$gte": "Wireless Headphones"}}, - ["WH001"], - ), - ( - {"name": {"$lt": "Smart Fitness Tracker"}}, - ["EC002"], - ), - ( - {"name": {"$lte": "Smart Fitness Tracker"}}, - ["FT004", "EC002"], - ), - ( - {"is_available": {"$eq": True}}, - ["WH001", "FT004", "EC002"], - ), - ( - {"is_available": {"$ne": True}}, - ["WB003"], - ), - # Test float column. - ( - {"price": {"$gt": 200.0}}, - ["EC002"], - ), - ( - {"price": {"$gte": 149.99}}, - ["WH001", "EC002"], - ), - ( - {"price": {"$lt": 50.0}}, - ["WB003"], - ), - ( - {"price": {"$lte": 79.95}}, - ["FT004", "WB003"], - ), - # These involve usage of AND, OR and NOT operators - ( - {"$or": [{"code": "WH001"}, {"code": "EC002"}]}, - ["WH001", "EC002"], - ), - ( - {"$or": [{"code": "WH001"}, {"available_quantity": 10}]}, - ["WH001", "EC002"], - ), - ( - {"$and": [{"code": "WH001"}, {"code": "EC002"}]}, - [], - ), - # Test for $not operator - ( - {"$not": {"code": "WB003"}}, - ["WH001", "FT004", "EC002"], - ), - ( - {"$not": [{"code": "WB003"}]}, - ["WH001", "FT004", "EC002"], - ), - ( - {"$not": {"available_quantity": 0}}, - ["WH001", "FT004", "EC002"], - ), - ( - {"$not": [{"available_quantity": 0}]}, - ["WH001", "FT004", "EC002"], - ), - ( - {"$not": {"is_available": True}}, - ["WB003"], - ), - ( - {"$not": [{"is_available": True}]}, - ["WB003"], - ), - ( - {"$not": {"price": {"$gt": 150.0}}}, - ["WH001", "FT004", "WB003"], - ), - ( - {"$not": [{"price": {"$gt": 150.0}}]}, - ["WH001", "FT004", "WB003"], - ), - # These involve special operators like $in, $nin, $between - # Test between - ( - {"available_quantity": {"$between": (40, 60)}}, - ["WH001"], - ), - # Test in - ( - {"name": {"$in": 
["Smart Fitness Tracker", "Stainless Steel Water Bottle"]}}, - ["FT004", "WB003"], - ), - # With numeric fields - ( - {"available_quantity": {"$in": [0, 10]}}, - ["WB003", "EC002"], - ), - # Test nin - ( - {"name": {"$nin": ["Smart Fitness Tracker", "Stainless Steel Water Bottle"]}}, - ["WH001", "EC002"], - ), - ## with numeric fields - ( - {"available_quantity": {"$nin": [50, 0, 10]}}, - ["FT004"], - ), - # These involve special operators like $like, $ilike that - # may be specified to certain databases. - ( - {"name": {"$like": "Wireless%"}}, - ["WH001"], - ), - ( - {"name": {"$like": "%less%"}}, # adam and jane - ["WH001", "WB003"], - ), - # These involve the special operator $exists - ( - {"tags": {"$exists": False}}, - [], - ), - ( - {"inventory_location": {"$exists": False}}, - ["WB003"], - ), -] - -NEGATIVE_TEST_CASES = [ - {"$nor": [{"code": "WH001"}, {"code": "EC002"}]}, - {"$and": {"is_available": True}}, - {"is_available": {"$and": True}}, - {"is_available": {"name": "{Wireless Headphones", "code": "EC002"}}, - {"my column": {"$and": True}}, - {"is_available": {"code": "WH001", "code": "EC002"}}, - {"$and": {}}, - {"$and": []}, - {"$not": True}, -] diff --git a/tests/test_async_vectorstore.py b/tests/test_async_vectorstore.py index 9b450287..40ab320b 100644 --- a/tests/test_async_vectorstore.py +++ b/tests/test_async_vectorstore.py @@ -27,9 +27,6 @@ from langchain_google_alloydb_pg import AlloyDBEngine, Column from langchain_google_alloydb_pg.async_vectorstore import AsyncAlloyDBVectorStore -DEFAULT_TABLE = "test_table" + str(uuid.uuid4()) -DEFAULT_TABLE_SYNC = "test_table_sync" + str(uuid.uuid4()) -CUSTOM_TABLE = "test-table-custom" + str(uuid.uuid4()) IMAGE_TABLE = "test_image_table" + str(uuid.uuid4()) VECTOR_SIZE = 768 @@ -107,43 +104,9 @@ async def engine(self, db_project, db_region, db_cluster, db_instance, db_name): ) yield engine - await aexecute(engine, f'DROP TABLE IF EXISTS "{DEFAULT_TABLE}"') - await aexecute(engine, f'DROP TABLE IF EXISTS 
"{CUSTOM_TABLE}"') + await aexecute(engine, f'DROP TABLE IF EXISTS "{IMAGE_TABLE}"') await engine.close() - @pytest_asyncio.fixture(scope="class") - async def vs(self, engine): - await engine._ainit_vectorstore_table(DEFAULT_TABLE, VECTOR_SIZE) - vs = await AsyncAlloyDBVectorStore.create( - engine, - embedding_service=embeddings_service, - table_name=DEFAULT_TABLE, - ) - yield vs - - @pytest_asyncio.fixture(scope="class") - async def vs_custom(self, engine): - await engine._ainit_vectorstore_table( - CUSTOM_TABLE, - VECTOR_SIZE, - id_column="myid", - content_column="mycontent", - embedding_column="myembedding", - metadata_columns=[Column("page", "TEXT"), Column("source", "TEXT")], - metadata_json_column="mymeta", - ) - vs = await AsyncAlloyDBVectorStore.create( - engine, - embedding_service=embeddings_service, - table_name=CUSTOM_TABLE, - id_column="myid", - content_column="mycontent", - embedding_column="myembedding", - metadata_columns=["page", "source"], - metadata_json_column="mymeta", - ) - yield vs - @pytest_asyncio.fixture(scope="class") async def image_vs(self, engine): await engine._ainit_vectorstore_table( @@ -183,65 +146,6 @@ async def image_uris(self): except FileNotFoundError: pass - async def test_init_with_constructor(self, engine): - with pytest.raises(Exception): - AsyncAlloyDBVectorStore( - engine, - embedding_service=embeddings_service, - table_name=CUSTOM_TABLE, - id_column="myid", - content_column="noname", - embedding_column="myembedding", - metadata_columns=["page", "source"], - metadata_json_column="mymeta", - ) - - async def test_post_init(self, engine): - with pytest.raises(ValueError): - await AsyncAlloyDBVectorStore.create( - engine, - embedding_service=embeddings_service, - table_name=CUSTOM_TABLE, - id_column="myid", - content_column="noname", - embedding_column="myembedding", - metadata_columns=["page", "source"], - metadata_json_column="mymeta", - ) - - async def test_aadd_texts(self, engine, vs): - ids = [str(uuid.uuid4()) for i in 
range(len(texts))] - await vs.aadd_texts(texts, ids=ids) - results = await afetch(engine, f'SELECT * FROM "{DEFAULT_TABLE}"') - assert len(results) == 3 - - ids = [str(uuid.uuid4()) for i in range(len(texts))] - await vs.aadd_texts(texts, metadatas, ids) - results = await afetch(engine, f'SELECT * FROM "{DEFAULT_TABLE}"') - assert len(results) == 6 - await aexecute(engine, f'TRUNCATE TABLE "{DEFAULT_TABLE}"') - - async def test_aadd_texts_edge_cases(self, engine, vs): - texts = ["Taylor's", '"Swift"', "best-friend"] - ids = [str(uuid.uuid4()) for i in range(len(texts))] - await vs.aadd_texts(texts, ids=ids) - results = await afetch(engine, f'SELECT * FROM "{DEFAULT_TABLE}"') - assert len(results) == 3 - await aexecute(engine, f'TRUNCATE TABLE "{DEFAULT_TABLE}"') - - async def test_aadd_docs(self, engine, vs): - ids = [str(uuid.uuid4()) for i in range(len(texts))] - await vs.aadd_documents(docs, ids=ids) - results = await afetch(engine, f'SELECT * FROM "{DEFAULT_TABLE}"') - assert len(results) == 3 - await aexecute(engine, f'TRUNCATE TABLE "{DEFAULT_TABLE}"') - - async def test_aadd_docs_no_ids(self, engine, vs): - await vs.aadd_documents(docs) - results = await afetch(engine, f'SELECT * FROM "{DEFAULT_TABLE}"') - assert len(results) == 3 - await aexecute(engine, f'TRUNCATE TABLE "{DEFAULT_TABLE}"') - async def test_aadd_images(self, engine, image_vs, image_uris): ids = [str(uuid.uuid4()) for i in range(len(image_uris))] metadatas = [ @@ -253,170 +157,3 @@ async def test_aadd_images(self, engine, image_vs, image_uris): assert results[0]["image_id"] == "0" assert results[0]["source"] == "google.com" await aexecute(engine, (f'TRUNCATE TABLE "{IMAGE_TABLE}"')) - - async def test_adelete(self, engine, vs): - ids = [str(uuid.uuid4()) for i in range(len(texts))] - await vs.aadd_texts(texts, ids=ids) - results = await afetch(engine, f'SELECT * FROM "{DEFAULT_TABLE}"') - assert len(results) == 3 - # delete an ID - await vs.adelete([ids[0]]) - results = await afetch(engine, 
f'SELECT * FROM "{DEFAULT_TABLE}"') - assert len(results) == 2 - # delete with no ids - result = await vs.adelete() - assert result == False - await aexecute(engine, f'TRUNCATE TABLE "{DEFAULT_TABLE}"') - - ##### Custom Vector Store ##### - async def test_aadd_embeddings(self, engine, vs_custom): - await vs_custom.aadd_embeddings( - texts=texts, embeddings=embeddings, metadatas=metadatas - ) - results = await afetch(engine, f'SELECT * FROM "{CUSTOM_TABLE}"') - assert len(results) == 3 - assert results[0]["mycontent"] == "foo" - assert results[0]["myembedding"] - assert results[0]["page"] == "0" - assert results[0]["source"] == "google.com" - await aexecute(engine, f'TRUNCATE TABLE "{CUSTOM_TABLE}"') - - async def test_aadd_texts_custom(self, engine, vs_custom): - ids = [str(uuid.uuid4()) for i in range(len(texts))] - await vs_custom.aadd_texts(texts, ids=ids) - results = await afetch(engine, f'SELECT * FROM "{CUSTOM_TABLE}"') - assert len(results) == 3 - assert results[0]["mycontent"] == "foo" - assert results[0]["myembedding"] - assert results[0]["page"] is None - assert results[0]["source"] is None - - ids = [str(uuid.uuid4()) for i in range(len(texts))] - await vs_custom.aadd_texts(texts, metadatas, ids) - results = await afetch(engine, f'SELECT * FROM "{CUSTOM_TABLE}"') - assert len(results) == 6 - await aexecute(engine, f'TRUNCATE TABLE "{CUSTOM_TABLE}"') - - async def test_aadd_docs_custom(self, engine, vs_custom): - ids = [str(uuid.uuid4()) for i in range(len(texts))] - docs = [ - Document( - page_content=texts[i], - metadata={"page": str(i), "source": "google.com"}, - ) - for i in range(len(texts)) - ] - await vs_custom.aadd_documents(docs, ids=ids) - - results = await afetch(engine, f'SELECT * FROM "{CUSTOM_TABLE}"') - assert len(results) == 3 - assert results[0]["mycontent"] == "foo" - assert results[0]["myembedding"] - assert results[0]["page"] == "0" - assert results[0]["source"] == "google.com" - await aexecute(engine, f'TRUNCATE TABLE 
"{CUSTOM_TABLE}"') - - async def test_adelete_custom(self, engine, vs_custom): - ids = [str(uuid.uuid4()) for i in range(len(texts))] - await vs_custom.aadd_texts(texts, ids=ids) - results = await afetch(engine, f'SELECT * FROM "{CUSTOM_TABLE}"') - content = [result["mycontent"] for result in results] - assert len(results) == 3 - assert "foo" in content - # delete an ID - await vs_custom.adelete([ids[0]]) - results = await afetch(engine, f'SELECT * FROM "{CUSTOM_TABLE}"') - content = [result["mycontent"] for result in results] - assert len(results) == 2 - assert "foo" not in content - await aexecute(engine, f'TRUNCATE TABLE "{CUSTOM_TABLE}"') - - async def test_ignore_metadata_columns(self, engine): - column_to_ignore = "source" - vs = await AsyncAlloyDBVectorStore.create( - engine, - embedding_service=embeddings_service, - table_name=CUSTOM_TABLE, - ignore_metadata_columns=[column_to_ignore], - id_column="myid", - content_column="mycontent", - embedding_column="myembedding", - metadata_json_column="mymeta", - ) - assert column_to_ignore not in vs.metadata_columns - - async def test_create_vectorstore_with_invalid_parameters_1(self, engine): - with pytest.raises(ValueError): - await AsyncAlloyDBVectorStore.create( - engine, - embedding_service=embeddings_service, - table_name=CUSTOM_TABLE, - id_column="myid", - content_column="mycontent", - embedding_column="myembedding", - metadata_columns=["random_column"], # invalid metadata column - ) - - async def test_create_vectorstore_with_invalid_parameters_2(self, engine): - with pytest.raises(ValueError): - await AsyncAlloyDBVectorStore.create( - engine, - embedding_service=embeddings_service, - table_name=CUSTOM_TABLE, - id_column="myid", - content_column="langchain_id", # invalid content column type - embedding_column="myembedding", - metadata_columns=["random_column"], - ) - - async def test_create_vectorstore_with_invalid_parameters_3(self, engine): - with pytest.raises(ValueError): - await 
AsyncAlloyDBVectorStore.create( - engine, - embedding_service=embeddings_service, - table_name=CUSTOM_TABLE, - id_column="myid", - content_column="mycontent", - embedding_column="random_column", # invalid embedding column - metadata_columns=["random_column"], - ) - - async def test_create_vectorstore_with_invalid_parameters_4(self, engine): - with pytest.raises(ValueError): - await AsyncAlloyDBVectorStore.create( - engine, - embedding_service=embeddings_service, - table_name=CUSTOM_TABLE, - id_column="myid", - content_column="mycontent", - embedding_column="langchain_id", # invalid embedding column data type - metadata_columns=["random_column"], - ) - - async def test_create_vectorstore_with_invalid_parameters_5(self, engine): - with pytest.raises(ValueError): - await AsyncAlloyDBVectorStore.create( - engine, - embedding_service=embeddings_service, - table_name=CUSTOM_TABLE, - id_column="myid", - content_column="mycontent", - embedding_column="langchain_id", - metadata_columns=["random_column"], - ignore_metadata_columns=[ - "one", - "two", - ], # invalid use of metadata_columns and ignore columns - ) - - async def test_create_vectorstore_with_init(self, engine): - with pytest.raises(Exception): - await AsyncAlloyDBVectorStore( - engine._pool, - embedding_service=embeddings_service, - table_name=CUSTOM_TABLE, - id_column="myid", - content_column="mycontent", - embedding_column="myembedding", - metadata_columns=["random_column"], # invalid metadata column - ) diff --git a/tests/test_async_vectorstore_from_methods.py b/tests/test_async_vectorstore_from_methods.py deleted file mode 100644 index 42f68d21..00000000 --- a/tests/test_async_vectorstore_from_methods.py +++ /dev/null @@ -1,227 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import uuid -from typing import Sequence - -import pytest -import pytest_asyncio -from langchain_core.documents import Document -from langchain_core.embeddings import DeterministicFakeEmbedding -from sqlalchemy import text -from sqlalchemy.engine.row import RowMapping - -from langchain_google_alloydb_pg import AlloyDBEngine, Column -from langchain_google_alloydb_pg.async_vectorstore import AsyncAlloyDBVectorStore - -DEFAULT_TABLE = "test_table" + str(uuid.uuid4()).replace("-", "_") -DEFAULT_TABLE_SYNC = "test_table_sync" + str(uuid.uuid4()).replace("-", "_") -CUSTOM_TABLE = "test_table_custom" + str(uuid.uuid4()).replace("-", "_") -CUSTOM_TABLE_WITH_INT_ID = "test_table_with_int_id" + str(uuid.uuid4()).replace( - "-", "_" -) -VECTOR_SIZE = 768 - - -embeddings_service = DeterministicFakeEmbedding(size=VECTOR_SIZE) - -texts = ["foo", "bar", "baz"] -metadatas = [{"page": str(i), "source": "google.com"} for i in range(len(texts))] -docs = [ - Document(page_content=texts[i], metadata=metadatas[i]) for i in range(len(texts)) -] - -embeddings = [embeddings_service.embed_query(texts[i]) for i in range(len(texts))] - - -def get_env_var(key: str, desc: str) -> str: - v = os.environ.get(key) - if v is None: - raise ValueError(f"Must set env var {key} to: {desc}") - return v - - -async def aexecute(engine: AlloyDBEngine, query: str) -> None: - async with engine._pool.connect() as conn: - await conn.execute(text(query)) - await conn.commit() - - -async def afetch(engine: AlloyDBEngine, query: str) -> Sequence[RowMapping]: - async with 
engine._pool.connect() as conn: - result = await conn.execute(text(query)) - result_map = result.mappings() - result_fetch = result_map.fetchall() - return result_fetch - - -@pytest.mark.asyncio -class TestVectorStoreFromMethods: - @pytest.fixture(scope="module") - def db_project(self) -> str: - return get_env_var("PROJECT_ID", "project id for google cloud") - - @pytest.fixture(scope="module") - def db_region(self) -> str: - return get_env_var("REGION", "region for AlloyDB instance") - - @pytest.fixture(scope="module") - def db_cluster(self) -> str: - return get_env_var("CLUSTER_ID", "cluster for AlloyDB") - - @pytest.fixture(scope="module") - def db_instance(self) -> str: - return get_env_var("INSTANCE_ID", "instance for AlloyDB") - - @pytest.fixture(scope="module") - def db_name(self) -> str: - return get_env_var("DATABASE_ID", "database name on AlloyDB instance") - - @pytest_asyncio.fixture - async def engine(self, db_project, db_region, db_cluster, db_instance, db_name): - engine = await AlloyDBEngine.afrom_instance( - project_id=db_project, - cluster=db_cluster, - instance=db_instance, - region=db_region, - database=db_name, - ) - await engine._ainit_vectorstore_table(DEFAULT_TABLE, VECTOR_SIZE) - await engine._ainit_vectorstore_table( - CUSTOM_TABLE, - VECTOR_SIZE, - id_column="myid", - content_column="mycontent", - embedding_column="myembedding", - metadata_columns=[Column("page", "TEXT"), Column("source", "TEXT")], - store_metadata=False, - ) - await engine._ainit_vectorstore_table( - CUSTOM_TABLE_WITH_INT_ID, - VECTOR_SIZE, - id_column=Column(name="integer_id", data_type="INTEGER", nullable="False"), - content_column="mycontent", - embedding_column="myembedding", - metadata_columns=[Column("page", "TEXT"), Column("source", "TEXT")], - store_metadata=False, - ) - yield engine - await aexecute(engine, f"DROP TABLE IF EXISTS {DEFAULT_TABLE}") - await aexecute(engine, f"DROP TABLE IF EXISTS {CUSTOM_TABLE}") - await aexecute(engine, f"DROP TABLE IF EXISTS 
{CUSTOM_TABLE_WITH_INT_ID}") - await engine.close() - - async def test_afrom_texts(self, engine): - ids = [str(uuid.uuid4()) for i in range(len(texts))] - await AsyncAlloyDBVectorStore.afrom_texts( - texts, - embeddings_service, - engine, - DEFAULT_TABLE, - metadatas=metadatas, - ids=ids, - ) - results = await afetch(engine, f"SELECT * FROM {DEFAULT_TABLE}") - assert len(results) == 3 - await aexecute(engine, f"TRUNCATE TABLE {DEFAULT_TABLE}") - - async def test_afrom_docs(self, engine): - ids = [str(uuid.uuid4()) for i in range(len(texts))] - await AsyncAlloyDBVectorStore.afrom_documents( - docs, - embeddings_service, - engine, - DEFAULT_TABLE, - ids=ids, - ) - results = await afetch(engine, f"SELECT * FROM {DEFAULT_TABLE}") - assert len(results) == 3 - await aexecute(engine, f"TRUNCATE TABLE {DEFAULT_TABLE}") - - async def test_afrom_texts_custom(self, engine): - ids = [str(uuid.uuid4()) for i in range(len(texts))] - await AsyncAlloyDBVectorStore.afrom_texts( - texts, - embeddings_service, - engine, - CUSTOM_TABLE, - ids=ids, - id_column="myid", - content_column="mycontent", - embedding_column="myembedding", - metadata_columns=["page", "source"], - ) - results = await afetch(engine, f"SELECT * FROM {CUSTOM_TABLE}") - assert len(results) == 3 - assert results[0]["mycontent"] == "foo" - assert results[0]["myembedding"] - assert results[0]["page"] is None - assert results[0]["source"] is None - - async def test_afrom_docs_custom(self, engine): - ids = [str(uuid.uuid4()) for i in range(len(texts))] - docs = [ - Document( - page_content=texts[i], - metadata={"page": str(i), "source": "google.com"}, - ) - for i in range(len(texts)) - ] - await AsyncAlloyDBVectorStore.afrom_documents( - docs, - embeddings_service, - engine, - CUSTOM_TABLE, - ids=ids, - id_column="myid", - content_column="mycontent", - embedding_column="myembedding", - metadata_columns=["page", "source"], - ) - - results = await afetch(engine, f"SELECT * FROM {CUSTOM_TABLE}") - assert len(results) == 3 - 
assert results[0]["mycontent"] == "foo" - assert results[0]["myembedding"] - assert results[0]["page"] == "0" - assert results[0]["source"] == "google.com" - await aexecute(engine, f"TRUNCATE TABLE {CUSTOM_TABLE}") - - async def test_afrom_docs_custom_with_int_id(self, engine): - ids = [i for i in range(len(texts))] - docs = [ - Document( - page_content=texts[i], - metadata={"page": str(i), "source": "google.com"}, - ) - for i in range(len(texts)) - ] - await AsyncAlloyDBVectorStore.afrom_documents( - docs, - embeddings_service, - engine, - CUSTOM_TABLE_WITH_INT_ID, - ids=ids, - id_column="integer_id", - content_column="mycontent", - embedding_column="myembedding", - metadata_columns=["page", "source"], - ) - - results = await afetch(engine, f"SELECT * FROM {CUSTOM_TABLE_WITH_INT_ID}") - assert len(results) == 3 - for row in results: - assert isinstance(row["integer_id"], int) - await aexecute(engine, f"TRUNCATE TABLE {CUSTOM_TABLE_WITH_INT_ID}") diff --git a/tests/test_async_vectorstore_index.py b/tests/test_async_vectorstore_index.py index 317f3559..976305e1 100644 --- a/tests/test_async_vectorstore_index.py +++ b/tests/test_async_vectorstore_index.py @@ -14,26 +14,25 @@ import os -import sys import uuid import pytest import pytest_asyncio from langchain_core.documents import Document from langchain_core.embeddings import DeterministicFakeEmbedding +from langchain_postgres.v2.indexes import DEFAULT_INDEX_NAME_SUFFIX from sqlalchemy import text from langchain_google_alloydb_pg import AlloyDBEngine from langchain_google_alloydb_pg.async_vectorstore import AsyncAlloyDBVectorStore from langchain_google_alloydb_pg.indexes import ( - DEFAULT_INDEX_NAME_SUFFIX, DistanceStrategy, - HNSWIndex, - IVFFlatIndex, + IVFIndex, ) -DEFAULT_TABLE = "test_table" + str(uuid.uuid4()).replace("-", "_") -DEFAULT_INDEX_NAME = DEFAULT_TABLE + DEFAULT_INDEX_NAME_SUFFIX +UUID_STR = str(uuid.uuid4()).replace("-", "_") +DEFAULT_TABLE = "test_table" + UUID_STR +DEFAULT_INDEX_NAME = 
DEFAULT_INDEX_NAME_SUFFIX + UUID_STR VECTOR_SIZE = 768 embeddings_service = DeterministicFakeEmbedding(size=VECTOR_SIZE) @@ -109,31 +108,14 @@ async def vs(self, engine): await vs.adrop_vector_index() yield vs - async def test_aapply_vector_index(self, vs): - index = HNSWIndex() - await vs.aapply_vector_index(index) - assert await vs.is_valid_index(DEFAULT_INDEX_NAME) - await vs.adrop_vector_index() - - async def test_areindex(self, vs): - if not await vs.is_valid_index(DEFAULT_INDEX_NAME): - index = HNSWIndex() - await vs.aapply_vector_index(index) - await vs.areindex() - await vs.areindex(DEFAULT_INDEX_NAME) - assert await vs.is_valid_index(DEFAULT_INDEX_NAME) - await vs.adrop_vector_index(DEFAULT_INDEX_NAME) - - async def test_dropindex(self, vs): - await vs.adrop_vector_index() - result = await vs.is_valid_index(DEFAULT_INDEX_NAME) - assert not result - - async def test_aapply_vector_index_ivfflat(self, vs): - index = IVFFlatIndex(distance_strategy=DistanceStrategy.EUCLIDEAN) + async def test_aapply_vector_index_ivf(self, vs): + index = IVFIndex( + name=DEFAULT_INDEX_NAME, + distance_strategy=DistanceStrategy.EUCLIDEAN, + ) await vs.aapply_vector_index(index, concurrently=True) assert await vs.is_valid_index(DEFAULT_INDEX_NAME) - index = IVFFlatIndex( + index = IVFIndex( name="secondindex", distance_strategy=DistanceStrategy.INNER_PRODUCT, ) @@ -141,7 +123,3 @@ async def test_aapply_vector_index_ivfflat(self, vs): assert await vs.is_valid_index("secondindex") await vs.adrop_vector_index("secondindex") await vs.adrop_vector_index() - - async def test_is_valid_index(self, vs): - is_valid = await vs.is_valid_index("invalid_index") - assert is_valid == False diff --git a/tests/test_async_vectorstore_search.py b/tests/test_async_vectorstore_search.py index 50946c79..00ae89c0 100644 --- a/tests/test_async_vectorstore_search.py +++ b/tests/test_async_vectorstore_search.py @@ -19,7 +19,7 @@ import pytest_asyncio from langchain_core.documents import Document from 
langchain_core.embeddings import DeterministicFakeEmbedding -from metadata_filtering_data import FILTERING_TEST_CASES, METADATAS +from langchain_postgres.v2.indexes import HNSWQueryOptions from PIL import Image from sqlalchemy import text @@ -27,14 +27,11 @@ from langchain_google_alloydb_pg.async_vectorstore import AsyncAlloyDBVectorStore from langchain_google_alloydb_pg.indexes import ( DistanceStrategy, - HNSWQueryOptions, ScaNNQueryOptions, ) -DEFAULT_TABLE = "test_table" + str(uuid.uuid4()).replace("-", "_") CUSTOM_TABLE = "test_table_custom" + str(uuid.uuid4()).replace("-", "_") IMAGE_TABLE = "test_image_table" + str(uuid.uuid4()).replace("-", "_") -CUSTOM_FILTER_TABLE = "test_table_custom_filter" + str(uuid.uuid4()).replace("-", "_") VECTOR_SIZE = 768 sync_method_exception_str = "Sync methods are not implemented for AsyncAlloyDBVectorStore. Use AlloyDBVectorStore interface instead." @@ -53,10 +50,6 @@ embeddings = [embeddings_service.embed_query("foo") for i in range(len(texts))] -filter_docs = [ - Document(page_content=texts[i], metadata=METADATAS[i]) for i in range(len(texts)) -] - class FakeImageEmbedding(DeterministicFakeEmbedding): @@ -115,26 +108,12 @@ async def engine(self, db_project, db_region, db_cluster, db_instance, db_name): database=db_name, ) yield engine - await aexecute(engine, f"DROP TABLE IF EXISTS {DEFAULT_TABLE}") await aexecute(engine, f"DROP TABLE IF EXISTS {CUSTOM_TABLE}") - await aexecute(engine, f"DROP TABLE IF EXISTS {CUSTOM_FILTER_TABLE}") + await aexecute(engine, f"DROP TABLE IF EXISTS {IMAGE_TABLE}") await engine.close() @pytest_asyncio.fixture(scope="class") - async def vs(self, engine): - await engine._ainit_vectorstore_table( - DEFAULT_TABLE, VECTOR_SIZE, store_metadata=False - ) - vs = await AsyncAlloyDBVectorStore.create( - engine, - embedding_service=embeddings_service, - table_name=DEFAULT_TABLE, - ) - await vs.aadd_documents(docs, ids=ids) - yield vs - - @pytest_asyncio.fixture(scope="class") - async def vs_custom(self, 
engine): + async def vs_custom_scann_query_option(self, engine, image_uris): await engine._ainit_vectorstore_table( CUSTOM_TABLE, VECTOR_SIZE, @@ -147,21 +126,6 @@ async def vs_custom(self, engine): ], store_metadata=False, ) - - vs_custom = await AsyncAlloyDBVectorStore.create( - engine, - embedding_service=embeddings_service, - table_name=CUSTOM_TABLE, - id_column="myid", - content_column="mycontent", - embedding_column="myembedding", - index_query_options=HNSWQueryOptions(ef_search=1), - ) - await vs_custom.aadd_documents(docs, ids=ids) - yield vs_custom - - @pytest_asyncio.fixture(scope="class") - async def vs_custom_scann_query_option(self, engine, vs_custom): vs_custom_scann_query_option = await AsyncAlloyDBVectorStore.create( engine, embedding_service=embeddings_service, @@ -173,6 +137,7 @@ async def vs_custom_scann_query_option(self, engine, vs_custom): num_leaves_to_search=1, pre_reordering_num_neighbors=2 ), ) + await vs_custom_scann_query_option.aadd_documents(docs, ids=ids) yield vs_custom_scann_query_option @pytest_asyncio.fixture(scope="class") @@ -204,59 +169,15 @@ async def image_vs(self, engine, image_uris): table_name=IMAGE_TABLE, distance_strategy=DistanceStrategy.COSINE_DISTANCE, ) - ids = [str(uuid.uuid4()) for i in range(len(image_uris))] await vs.aadd_images(image_uris, ids=ids) yield vs - @pytest_asyncio.fixture(scope="class") - async def vs_custom_filter(self, engine): - await engine._ainit_vectorstore_table( - CUSTOM_FILTER_TABLE, - VECTOR_SIZE, - metadata_columns=[ - Column("name", "TEXT"), - Column("code", "TEXT"), - Column("price", "FLOAT"), - Column("is_available", "BOOLEAN"), - Column("tags", "TEXT[]"), - Column("inventory_location", "INTEGER[]"), - Column("available_quantity", "INTEGER", nullable=True), - ], - id_column="langchain_id", - store_metadata=False, - ) - - vs_custom_filter = await AsyncAlloyDBVectorStore.create( - engine, - embedding_service=embeddings_service, - table_name=CUSTOM_FILTER_TABLE, - metadata_columns=[ - 
"name", - "code", - "price", - "is_available", - "tags", - "inventory_location", - "available_quantity", - ], - id_column="langchain_id", - ) - await vs_custom_filter.aadd_documents(filter_docs, ids=ids) - yield vs_custom_filter - - async def test_asimilarity_search(self, vs): - results = await vs.asimilarity_search("foo", k=1) - assert len(results) == 1 - assert results == [Document(page_content="foo", id=ids[0])] - results = await vs.asimilarity_search("foo", k=1, filter="content = 'bar'") - assert results == [Document(page_content="bar", id=ids[1])] - async def test_asimilarity_search_scann(self, vs_custom_scann_query_option): results = await vs_custom_scann_query_option.asimilarity_search("foo", k=1) assert len(results) == 1 assert results == [Document(page_content="foo", id=ids[0])] results = await vs_custom_scann_query_option.asimilarity_search( - "foo", k=1, filter="mycontent = 'bar'" + "foo", k=1, filter={"mycontent": "bar"} ) assert results == [Document(page_content="bar", id=ids[1])] @@ -268,171 +189,6 @@ async def test_asimilarity_search_image(self, image_vs, image_uris): assert len(results) == 1 assert results[0].metadata["image_uri"] == image_uris[3] - async def test_asimilarity_search_score(self, vs): - results = await vs.asimilarity_search_with_score("foo") - assert len(results) == 4 - assert results[0][0] == Document(page_content="foo", id=ids[0]) - assert results[0][1] == 0 - - async def test_asimilarity_search_by_vector(self, vs): - embedding = embeddings_service.embed_query("foo") - results = await vs.asimilarity_search_by_vector(embedding) - assert len(results) == 4 - assert results[0] == Document(page_content="foo", id=ids[0]) - results = await vs.asimilarity_search_with_score_by_vector(embedding) - assert results[0][0] == Document(page_content="foo", id=ids[0]) - assert results[0][1] == 0 - - async def test_similarity_search_with_relevance_scores_threshold_cosine(self, vs): - score_threshold = {"score_threshold": 0} - results = await 
vs.asimilarity_search_with_relevance_scores( - "foo", **score_threshold - ) - # Note: Since tests use FakeEmbeddings which are non-normalized vectors, results might have scores beyond the range [0,1]. - # For a normalized embedding service, a threshold of zero will yield all matched documents. - assert len(results) == 2 - - score_threshold = {"score_threshold": 0.02} - results = await vs.asimilarity_search_with_relevance_scores( - "foo", **score_threshold - ) - assert len(results) == 2 - - score_threshold = {"score_threshold": 0.9} - results = await vs.asimilarity_search_with_relevance_scores( - "foo", **score_threshold - ) - assert len(results) == 1 - assert results[0][0] == Document(page_content="foo", id=ids[0]) - - score_threshold = {"score_threshold": 0.02} - vs.distance_strategy = DistanceStrategy.EUCLIDEAN - results = await vs.asimilarity_search_with_relevance_scores( - "foo", **score_threshold - ) - assert len(results) == 1 - - async def test_similarity_search_with_relevance_scores_threshold_euclidean( - self, engine - ): - vs = await AsyncAlloyDBVectorStore.create( - engine, - embedding_service=embeddings_service, - table_name=DEFAULT_TABLE, - distance_strategy=DistanceStrategy.EUCLIDEAN, - ) - - score_threshold = {"score_threshold": 0.9} - results = await vs.asimilarity_search_with_relevance_scores( - "foo", **score_threshold - ) - assert len(results) == 1 - assert results[0][0] == Document(page_content="foo", id=ids[0]) - - async def test_amax_marginal_relevance_search(self, vs): - results = await vs.amax_marginal_relevance_search("bar") - assert results[0] == Document(page_content="bar", id=ids[1]) - results = await vs.amax_marginal_relevance_search( - "bar", filter="content = 'boo'" - ) - assert results[0] == Document(page_content="boo", id=ids[3]) - - async def test_amax_marginal_relevance_search_vector(self, vs): - embedding = embeddings_service.embed_query("bar") - results = await vs.amax_marginal_relevance_search_by_vector(embedding) - assert 
results[0] == Document(page_content="bar", id=ids[1]) - - async def test_amax_marginal_relevance_search_vector_score(self, vs): - embedding = embeddings_service.embed_query("bar") - results = await vs.amax_marginal_relevance_search_with_score_by_vector( - embedding - ) - assert results[0][0] == Document(page_content="bar", id=ids[1]) - - results = await vs.amax_marginal_relevance_search_with_score_by_vector( - embedding, lambda_mult=0.75, fetch_k=10 - ) - assert results[0][0] == Document(page_content="bar", id=ids[1]) - - async def test_similarity_search(self, vs_custom): - results = await vs_custom.asimilarity_search("foo", k=1) - assert len(results) == 1 - assert results == [Document(page_content="foo", id=ids[0])] - results = await vs_custom.asimilarity_search( - "foo", k=1, filter="mycontent = 'bar'" - ) - assert results == [Document(page_content="bar", id=ids[1])] - async def test_similarity_search_image(self, image_vs, image_uris): with pytest.raises(NotImplementedError): await image_vs.similarity_search_image(image_uris[0], k=1) - - async def test_similarity_search_score(self, vs_custom): - results = await vs_custom.asimilarity_search_with_score("foo") - assert len(results) == 4 - assert results[0][0] == Document(page_content="foo", id=ids[0]) - assert results[0][1] == 0 - - async def test_similarity_search_by_vector(self, vs_custom): - embedding = embeddings_service.embed_query("foo") - results = await vs_custom.asimilarity_search_by_vector(embedding) - assert len(results) == 4 - assert results[0] == Document(page_content="foo", id=ids[0]) - results = await vs_custom.asimilarity_search_with_score_by_vector(embedding) - assert results[0][0] == Document(page_content="foo", id=ids[0]) - assert results[0][1] == 0 - - async def test_max_marginal_relevance_search(self, vs_custom): - results = await vs_custom.amax_marginal_relevance_search("bar") - assert results[0] == Document(page_content="bar", id=ids[1]) - results = await 
vs_custom.amax_marginal_relevance_search( - "bar", filter="mycontent = 'boo'" - ) - assert results[0] == Document(page_content="boo", id=ids[3]) - - async def test_max_marginal_relevance_search_vector(self, vs_custom): - embedding = embeddings_service.embed_query("bar") - results = await vs_custom.amax_marginal_relevance_search_by_vector(embedding) - assert results[0] == Document(page_content="bar", id=ids[1]) - - async def test_max_marginal_relevance_search_vector_score(self, vs_custom): - embedding = embeddings_service.embed_query("bar") - results = await vs_custom.amax_marginal_relevance_search_with_score_by_vector( - embedding - ) - assert results[0][0] == Document(page_content="bar", id=ids[1]) - - results = await vs_custom.amax_marginal_relevance_search_with_score_by_vector( - embedding, lambda_mult=0.75, fetch_k=10 - ) - assert results[0][0] == Document(page_content="bar", id=ids[1]) - - async def test_aget_by_ids(self, vs): - test_ids = [ids[0]] - results = await vs.aget_by_ids(ids=test_ids) - - assert results[0] == Document(page_content="foo", id=ids[0]) - - async def test_aget_by_ids_custom_vs(self, vs_custom): - test_ids = [ids[0]] - results = await vs_custom.aget_by_ids(ids=test_ids) - - assert results[0] == Document(page_content="foo", id=ids[0]) - - def test_get_by_ids(self, vs): - test_ids = [ids[0]] - with pytest.raises(Exception, match=sync_method_exception_str): - vs.get_by_ids(ids=test_ids) - - @pytest.mark.parametrize("test_filter, expected_ids", FILTERING_TEST_CASES) - async def test_vectorstore_with_metadata_filters( - self, - vs_custom_filter, - test_filter, - expected_ids, - ): - """Test end to end construction and search.""" - docs = await vs_custom_filter.asimilarity_search( - "meow", k=5, filter=test_filter - ) - assert [doc.metadata["code"] for doc in docs] == expected_ids, test_filter diff --git a/tests/test_engine.py b/tests/test_engine.py index bbbb4c86..3650fd46 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -245,6 
+245,28 @@ async def getconn() -> asyncpg.Connection: await aexecute(engine, "SELECT 1") await engine.close() + async def test_from_connection_string( + self, + db_name, + user, + password, + ): + port = "5432" + url = f"postgresql+asyncpg://{user}:{password}@{host}:{port}/{db_name}" + engine = AlloyDBEngine.from_connection_string( + url, + echo=True, + poolclass=NullPool, + ) + await aexecute(engine, "SELECT 1") + await engine.close() + + engine = AlloyDBEngine.from_connection_string( + URL.create("postgresql+asyncpg", user, password, host, port, db_name) + ) + await aexecute(engine, "SELECT 1") + await engine.close() + async def test_from_engine_args_url( self, db_name, diff --git a/tests/test_indexes.py b/tests/test_indexes.py index a441058c..f3675eee 100644 --- a/tests/test_indexes.py +++ b/tests/test_indexes.py @@ -16,10 +16,6 @@ from langchain_google_alloydb_pg.indexes import ( DistanceStrategy, - HNSWIndex, - HNSWQueryOptions, - IVFFlatIndex, - IVFFlatQueryOptions, IVFIndex, IVFQueryOptions, ScaNNIndex, @@ -48,42 +44,6 @@ def test_distance_strategy(self): scann_index = ScaNNIndex(distance_strategy=DistanceStrategy.INNER_PRODUCT) assert scann_index.get_index_function() == "dot_prod" - def test_hnsw_index(self): - index = HNSWIndex(name="test_index", m=32, ef_construction=128) - assert index.index_type == "hnsw" - assert index.m == 32 - assert index.ef_construction == 128 - assert index.index_options() == "(m = 32, ef_construction = 128)" - - def test_hnsw_query_options(self): - options = HNSWQueryOptions(ef_search=80) - assert options.to_parameter() == ["hnsw.ef_search = 80"] - - with warnings.catch_warnings(record=True) as w: - options.to_string() - - assert len(w) == 1 - assert "to_string is deprecated, use to_parameter instead." 
in str( - w[-1].message - ) - - def test_ivfflat_index(self): - index = IVFFlatIndex(name="test_index", lists=200) - assert index.index_type == "ivfflat" - assert index.lists == 200 - assert index.index_options() == "(lists = 200)" - - def test_ivfflat_query_options(self): - options = IVFFlatQueryOptions(probes=2) - assert options.to_parameter() == ["ivfflat.probes = 2"] - - with warnings.catch_warnings(record=True) as w: - options.to_string() - assert len(w) == 1 - assert "to_string is deprecated, use to_parameter instead." in str( - w[-1].message - ) - def test_ivf_index(self): index = IVFIndex(name="test_index", lists=200) assert index.index_type == "ivf" diff --git a/tests/test_standard_test_suite.py b/tests/test_standard_test_suite.py index db2765b9..580bfaff 100644 --- a/tests/test_standard_test_suite.py +++ b/tests/test_standard_test_suite.py @@ -23,8 +23,8 @@ from langchain_google_alloydb_pg import AlloyDBEngine, AlloyDBVectorStore, Column -DEFAULT_TABLE = "test_table_standard_test_suite" + str(uuid.uuid4()) -DEFAULT_TABLE_SYNC = "test_table_sync_standard_test_suite" + str(uuid.uuid4()) +DEFAULT_TABLE = "test_table" + str(uuid.uuid4()) +DEFAULT_TABLE_SYNC = "test_table_sync" + str(uuid.uuid4()) def get_env_var(key: str, desc: str) -> str: diff --git a/tests/test_vectorstore.py b/tests/test_vectorstore.py index 3392a14b..4bd80538 100644 --- a/tests/test_vectorstore.py +++ b/tests/test_vectorstore.py @@ -20,19 +20,14 @@ import pytest import pytest_asyncio -from google.cloud.alloydb.connector import AsyncConnector, IPTypes from langchain_core.documents import Document from langchain_core.embeddings import DeterministicFakeEmbedding from PIL import Image from sqlalchemy import text from sqlalchemy.engine.row import RowMapping -from sqlalchemy.ext.asyncio import AsyncEngine, create_async_engine from langchain_google_alloydb_pg import AlloyDBEngine, AlloyDBVectorStore, Column -DEFAULT_TABLE = "test_table" + str(uuid.uuid4()) -DEFAULT_TABLE_SYNC = 
"test_table_sync" + str(uuid.uuid4()) -CUSTOM_TABLE = "test-table-custom" + str(uuid.uuid4()) IMAGE_TABLE = "test_image_table" + str(uuid.uuid4()) IMAGE_TABLE_SYNC = "test_image_table_sync" + str(uuid.uuid4()) VECTOR_SIZE = 768 @@ -129,20 +124,10 @@ async def engine(self, db_project, db_region, db_cluster, db_instance, db_name): ) yield engine - await aexecute(engine, f'DROP TABLE IF EXISTS "{DEFAULT_TABLE}"') + await aexecute(engine, f'DROP TABLE IF EXISTS "{IMAGE_TABLE}"') await engine.close() await engine._connector.close() - @pytest_asyncio.fixture(scope="class") - async def vs(self, engine): - await engine.ainit_vectorstore_table(DEFAULT_TABLE, VECTOR_SIZE) - vs = await AlloyDBVectorStore.create( - engine, - embedding_service=embeddings_service, - table_name=DEFAULT_TABLE, - ) - yield vs - @pytest_asyncio.fixture(scope="class") async def engine_sync( self, db_project, db_region, db_cluster, db_instance, db_name @@ -156,44 +141,9 @@ async def engine_sync( ) yield engine_sync - await aexecute(engine_sync, f'DROP TABLE IF EXISTS "{DEFAULT_TABLE_SYNC}"') + await aexecute(engine_sync, f'DROP TABLE IF EXISTS "{IMAGE_TABLE_SYNC}"') await engine_sync.close() - @pytest_asyncio.fixture(scope="class") - def vs_sync(self, engine_sync): - engine_sync.init_vectorstore_table(DEFAULT_TABLE_SYNC, VECTOR_SIZE) - - vs = AlloyDBVectorStore.create_sync( - engine_sync, - embedding_service=embeddings_service, - table_name=DEFAULT_TABLE_SYNC, - ) - yield vs - - @pytest_asyncio.fixture(scope="class") - async def vs_custom(self, engine): - await engine.ainit_vectorstore_table( - CUSTOM_TABLE, - VECTOR_SIZE, - id_column="myid", - content_column="mycontent", - embedding_column="myembedding", - metadata_columns=[Column("page", "TEXT"), Column("source", "TEXT")], - metadata_json_column="mymeta", - ) - vs = await AlloyDBVectorStore.create( - engine, - embedding_service=embeddings_service, - table_name=CUSTOM_TABLE, - id_column="myid", - content_column="mycontent", - 
embedding_column="myembedding", - metadata_columns=["page", "source"], - metadata_json_column="mymeta", - ) - yield vs - await aexecute(engine, f'DROP TABLE IF EXISTS "{CUSTOM_TABLE}"') - @pytest_asyncio.fixture(scope="class") def image_uris(self): red_uri = str(uuid.uuid4()).replace("-", "_") + "test_image_red.jpg" @@ -214,125 +164,6 @@ def image_uris(self): except FileNotFoundError: pass - async def test_init_with_constructor(self, engine): - with pytest.raises(Exception): - AlloyDBVectorStore( - engine, - embedding_service=embeddings_service, - table_name=CUSTOM_TABLE, - id_column="myid", - content_column="noname", - embedding_column="myembedding", - metadata_columns=["page", "source"], - metadata_json_column="mymeta", - ) - - async def test_post_init(self, engine): - with pytest.raises(ValueError): - await AlloyDBVectorStore.create( - engine, - embedding_service=embeddings_service, - table_name=CUSTOM_TABLE, - id_column="myid", - content_column="noname", - embedding_column="myembedding", - metadata_columns=["page", "source"], - metadata_json_column="mymeta", - ) - - async def test_aadd_texts(self, engine, vs): - ids = [str(uuid.uuid4()) for i in range(len(texts))] - await vs.aadd_texts(texts, ids=ids) - results = await afetch(engine, f'SELECT * FROM "{DEFAULT_TABLE}"') - assert len(results) == 3 - - ids = [str(uuid.uuid4()) for i in range(len(texts))] - await vs.aadd_texts(texts, metadatas, ids) - results = await afetch(engine, f'SELECT * FROM "{DEFAULT_TABLE}"') - assert len(results) == 6 - await aexecute(engine, f'TRUNCATE TABLE "{DEFAULT_TABLE}"') - - async def test_cross_env_add_texts(self, engine, vs): - ids = [str(uuid.uuid4()) for i in range(len(texts))] - vs.add_texts(texts, ids=ids) - results = await afetch(engine, f'SELECT * FROM "{DEFAULT_TABLE}"') - assert len(results) == 3 - vs.delete(ids) - await aexecute(engine, f'TRUNCATE TABLE "{DEFAULT_TABLE}"') - - async def test_aadd_texts_edge_cases(self, engine, vs): - texts = ["Taylor's", '"Swift"', 
"best-friend"] - ids = [str(uuid.uuid4()) for i in range(len(texts))] - await vs.aadd_texts(texts, ids=ids) - results = await afetch(engine, f'SELECT * FROM "{DEFAULT_TABLE}"') - assert len(results) == 3 - await aexecute(engine, f'TRUNCATE TABLE "{DEFAULT_TABLE}"') - - async def test_aadd_docs(self, engine, vs): - ids = [str(uuid.uuid4()) for i in range(len(texts))] - await vs.aadd_documents(docs, ids=ids) - results = await afetch(engine, f'SELECT * FROM "{DEFAULT_TABLE}"') - assert len(results) == 3 - await aexecute(engine, f'TRUNCATE TABLE "{DEFAULT_TABLE}"') - - async def test_aadd_embeddings(self, engine, vs_custom): - await vs_custom.aadd_embeddings( - texts=texts, embeddings=embeddings, metadatas=metadatas - ) - results = await afetch(engine, f'SELECT * FROM "{CUSTOM_TABLE}"') - assert len(results) == 3 - assert results[0]["mycontent"] == "foo" - assert results[0]["myembedding"] - assert results[0]["page"] == "0" - assert results[0]["source"] == "google.com" - await aexecute(engine, f'TRUNCATE TABLE "{CUSTOM_TABLE}"') - - async def test_adelete(self, engine, vs): - ids = [str(uuid.uuid4()) for i in range(len(texts))] - await vs.aadd_texts(texts, ids=ids) - results = await afetch(engine, f'SELECT * FROM "{DEFAULT_TABLE}"') - assert len(results) == 3 - # delete an ID - await vs.adelete([ids[0]]) - results = await afetch(engine, f'SELECT * FROM "{DEFAULT_TABLE}"') - assert len(results) == 2 - await aexecute(engine, f'TRUNCATE TABLE "{DEFAULT_TABLE}"') - - async def test_aadd_texts_custom(self, engine, vs_custom): - ids = [str(uuid.uuid4()) for i in range(len(texts))] - await vs_custom.aadd_texts(texts, ids=ids) - results = await afetch(engine, f'SELECT * FROM "{CUSTOM_TABLE}"') - assert len(results) == 3 - assert results[0]["mycontent"] == "foo" - assert results[0]["myembedding"] - assert results[0]["page"] is None - assert results[0]["source"] is None - - ids = [str(uuid.uuid4()) for i in range(len(texts))] - await vs_custom.aadd_texts(texts, metadatas, ids) - 
results = await afetch(engine, f'SELECT * FROM "{CUSTOM_TABLE}"') - assert len(results) == 6 - await aexecute(engine, f'TRUNCATE TABLE "{CUSTOM_TABLE}"') - - async def test_aadd_docs_custom(self, engine, vs_custom): - ids = [str(uuid.uuid4()) for i in range(len(texts))] - docs = [ - Document( - page_content=texts[i], - metadata={"page": str(i), "source": "google.com"}, - ) - for i in range(len(texts)) - ] - await vs_custom.aadd_documents(docs, ids=ids) - - results = await afetch(engine, f'SELECT * FROM "{CUSTOM_TABLE}"') - assert len(results) == 3 - assert results[0]["mycontent"] == "foo" - assert results[0]["myembedding"] - assert results[0]["page"] == "0" - assert results[0]["source"] == "google.com" - await aexecute(engine, f'TRUNCATE TABLE "{CUSTOM_TABLE}"') - async def test_aadd_images(self, engine_sync, image_uris): engine_sync.init_vectorstore_table( IMAGE_TABLE, @@ -360,37 +191,6 @@ async def test_aadd_images(self, engine_sync, image_uris): assert results[0]["source"] == "google.com" await aexecute(engine_sync, f'TRUNCATE TABLE "{IMAGE_TABLE}"') - async def test_adelete_custom(self, engine, vs_custom): - ids = [str(uuid.uuid4()) for i in range(len(texts))] - await vs_custom.aadd_texts(texts, ids=ids) - results = await afetch(engine, f'SELECT * FROM "{CUSTOM_TABLE}"') - content = [result["mycontent"] for result in results] - assert len(results) == 3 - assert "foo" in content - # delete an ID - await vs_custom.adelete([ids[0]]) - results = await afetch(engine, f'SELECT * FROM "{CUSTOM_TABLE}"') - content = [result["mycontent"] for result in results] - assert len(results) == 2 - assert "foo" not in content - await aexecute(engine, f'TRUNCATE TABLE "{CUSTOM_TABLE}"') - - async def test_add_docs(self, engine_sync, vs_sync): - ids = [str(uuid.uuid4()) for i in range(len(texts))] - vs_sync.add_documents(docs, ids=ids) - results = await afetch(engine_sync, f'SELECT * FROM "{DEFAULT_TABLE_SYNC}"') - assert len(results) == 3 - vs_sync.delete(ids) - await 
aexecute(engine_sync, f'TRUNCATE TABLE "{DEFAULT_TABLE_SYNC}"') - - async def test_add_texts(self, engine_sync, vs_sync): - ids = [str(uuid.uuid4()) for i in range(len(texts))] - vs_sync.add_texts(texts, ids=ids) - results = await afetch(engine_sync, f'SELECT * FROM "{DEFAULT_TABLE_SYNC}"') - assert len(results) == 3 - await vs_sync.adelete(ids) - await aexecute(engine_sync, f'TRUNCATE TABLE "{DEFAULT_TABLE_SYNC}"') - async def test_add_images(self, engine_sync, image_uris): engine_sync.init_vectorstore_table(IMAGE_TABLE_SYNC, VECTOR_SIZE) vs = AlloyDBVectorStore.create_sync( @@ -405,262 +205,3 @@ async def test_add_images(self, engine_sync, image_uris): assert len(results) == len(image_uris) await vs.adelete(ids) await aexecute(engine_sync, f'DROP TABLE IF EXISTS "{IMAGE_TABLE_SYNC}"') - - async def test_cross_env(self, engine_sync, vs_sync): - ids = [str(uuid.uuid4()) for i in range(len(texts))] - await vs_sync.aadd_texts(texts, ids=ids) - results = await afetch(engine_sync, f'SELECT * FROM "{DEFAULT_TABLE_SYNC}"') - assert len(results) == 3 - await vs_sync.adelete(ids) - await aexecute(engine_sync, f'TRUNCATE TABLE "{DEFAULT_TABLE_SYNC}"') - - async def test_add_embeddings(self, engine_sync, vs_custom): - vs_custom.add_embeddings( - texts=texts, - embeddings=embeddings, - metadatas=[ - {"page": str(i), "source": "google.com"} for i in range(len(texts)) - ], - ) - results = await afetch(engine_sync, f'SELECT * FROM "{CUSTOM_TABLE}"') - assert len(results) == 3 - assert results[0]["mycontent"] == "foo" - assert results[0]["myembedding"] - assert results[0]["page"] == "0" - assert results[0]["source"] == "google.com" - await aexecute(engine_sync, f'TRUNCATE TABLE "{CUSTOM_TABLE}"') - - async def test_create_vectorstore_with_invalid_parameters(self, engine): - with pytest.raises(ValueError): - await AlloyDBVectorStore.create( - engine, - embedding_service=embeddings_service, - table_name=CUSTOM_TABLE, - id_column="myid", - content_column="mycontent", - 
embedding_column="myembedding", - metadata_columns=["random_column"], # invalid metadata column - ) - with pytest.raises(ValueError): - await AlloyDBVectorStore.create( - engine, - embedding_service=embeddings_service, - table_name=CUSTOM_TABLE, - id_column="myid", - content_column="langchain_id", # invalid content column type - embedding_column="myembedding", - metadata_columns=["random_column"], - ) - with pytest.raises(ValueError): - await AlloyDBVectorStore.create( - engine, - embedding_service=embeddings_service, - table_name=CUSTOM_TABLE, - id_column="myid", - content_column="mycontent", - embedding_column="random_column", # invalid embedding column - metadata_columns=["random_column"], - ) - with pytest.raises(ValueError): - await AlloyDBVectorStore.create( - engine, - embedding_service=embeddings_service, - table_name=CUSTOM_TABLE, - id_column="myid", - content_column="mycontent", - embedding_column="langchain_id", # invalid embedding column data type - metadata_columns=["random_column"], - ) - - async def test_from_engine( - self, - db_project, - db_region, - db_cluster, - db_instance, - db_name, - user, - password, - ): - async with AsyncConnector() as connector: - - async def getconn(): - conn = await connector.connect( # type: ignore - f"projects/{db_project}/locations/{db_region}/clusters/{db_cluster}/instances/{db_instance}", - "asyncpg", - user=user, - password=password, - db=db_name, - enable_iam_auth=False, - ip_type=IPTypes.PUBLIC, - ) - return conn - - engine = create_async_engine( - "postgresql+asyncpg://", - async_creator=getconn, - ) - - engine = AlloyDBEngine.from_engine(engine) - table_name = "test_table" + str(uuid.uuid4()).replace("-", "_") - await engine.ainit_vectorstore_table(table_name, VECTOR_SIZE) - vs = await AlloyDBVectorStore.create( - engine, - embedding_service=embeddings_service, - table_name=table_name, - ) - await vs.aadd_texts(["foo"]) - results = await afetch(engine, f"SELECT * FROM {table_name}") - assert len(results) == 1 
- - await aexecute(engine, f"DROP TABLE {table_name}") - await engine.close() - await engine._connector.close() - - async def test_from_engine_loop_connector( - self, - db_project, - db_region, - db_cluster, - db_instance, - db_name, - user, - password, - ): - async def init_connection_pool( - # connector: AsyncConnector, - ) -> AsyncEngine: - connector = AsyncConnector() - - async def getconn(): - conn = await connector.connect( - f"projects/{db_project}/locations/{db_region}/clusters/{db_cluster}/instances/{db_instance}", - "asyncpg", - user=user, - password=password, - db=db_name, - enable_iam_auth=False, - ip_type="PUBLIC", - ) - return conn - - pool = create_async_engine( - "postgresql+asyncpg://", - async_creator=getconn, - ) - return pool - - loop = asyncio.new_event_loop() - thread = Thread(target=loop.run_forever, daemon=True) - thread.start() - - # connector = AsyncConnector() - # coro = init_connection_pool(connector) - coro = init_connection_pool() - pool = asyncio.run_coroutine_threadsafe(coro, loop).result() - engine = AlloyDBEngine.from_engine(pool, loop) - table_name = "test_table" + str(uuid.uuid4()).replace("-", "_") - await engine.ainit_vectorstore_table(table_name, VECTOR_SIZE) - vs = await AlloyDBVectorStore.create( - engine, - embedding_service=embeddings_service, - table_name=table_name, - ) - await vs.aadd_texts(["foo"]) - vs.add_texts(["foo"]) - results = await afetch(engine, f"SELECT * FROM {table_name}") - assert len(results) == 2 - - await aexecute(engine, f"TRUNCATE TABLE {table_name}") - await engine.close() - - vs = AlloyDBVectorStore.create_sync( - engine, - embedding_service=embeddings_service, - table_name=table_name, - ) - await vs.aadd_texts(["foo"]) - vs.add_texts(["foo"]) - results = await afetch(engine, f"SELECT * FROM {table_name}") - assert len(results) == 2 - - await aexecute(engine, f"DROP TABLE {table_name}") - - async def test_from_engine_args_url( - self, - db_name, - user, - password, - ): - port = "5432" - url = 
f"postgresql+asyncpg://{user}:{password}@{host}:{port}/{db_name}" - engine = AlloyDBEngine.from_engine_args(url) - table_name = "test_table" + str(uuid.uuid4()).replace("-", "_") - await engine.ainit_vectorstore_table(table_name, VECTOR_SIZE) - vs = await AlloyDBVectorStore.create( - engine, - embedding_service=embeddings_service, - table_name=table_name, - ) - await vs.aadd_texts(["foo"]) - vs.add_texts(["foo"]) - results = await afetch(engine, f"SELECT * FROM {table_name}") - assert len(results) == 2 - - await aexecute(engine, f"TRUNCATE TABLE {table_name}") - vs = AlloyDBVectorStore.create_sync( - engine, - embedding_service=embeddings_service, - table_name=table_name, - ) - await vs.aadd_texts(["foo"]) - vs.add_texts(["bar"]) - results = await afetch(engine, f"SELECT * FROM {table_name}") - assert len(results) == 2 - await aexecute(engine, f"DROP TABLE {table_name}") - await engine.close() - await engine._connector.close() - - async def test_from_engine_loop( - self, - db_name, - user, - password, - ): - port = "5432" - url = f"postgresql+asyncpg://{user}:{password}@{host}:{port}/{db_name}" - - loop = asyncio.new_event_loop() - thread = Thread(target=loop.run_forever, daemon=True) - thread.start() - pool = create_async_engine(url) - engine = AlloyDBEngine.from_engine(pool, loop) - - table_name = "test_table" + str(uuid.uuid4()).replace("-", "_") - await engine.ainit_vectorstore_table(table_name, VECTOR_SIZE) - vs = await AlloyDBVectorStore.create( - engine, - embedding_service=embeddings_service, - table_name=table_name, - ) - await vs.aadd_texts(["foo"]) - vs.add_texts(["foo"]) - results = await afetch(engine, f"SELECT * FROM {table_name}") - assert len(results) == 2 - - await aexecute(engine, f"TRUNCATE TABLE {table_name}") - vs = AlloyDBVectorStore.create_sync( - engine, - embedding_service=embeddings_service, - table_name=table_name, - ) - await vs.aadd_texts(["foo"]) - vs.add_texts(["bar"]) - results = await afetch(engine, f"SELECT * FROM {table_name}") - 
assert len(results) == 2 - await aexecute(engine, f"DROP TABLE {table_name}") - await engine.close() - await engine._connector.close() - - def test_get_table_name(self, vs): - assert vs.get_table_name() == DEFAULT_TABLE diff --git a/tests/test_vectorstore_embeddings.py b/tests/test_vectorstore_embeddings.py index 5afc3ada..a2fed27f 100644 --- a/tests/test_vectorstore_embeddings.py +++ b/tests/test_vectorstore_embeddings.py @@ -18,6 +18,7 @@ import pytest import pytest_asyncio from langchain_core.documents import Document +from langchain_postgres.v2.indexes import DistanceStrategy, HNSWQueryOptions from sqlalchemy import text from langchain_google_alloydb_pg import ( @@ -26,7 +27,6 @@ AlloyDBVectorStore, Column, ) -from langchain_google_alloydb_pg.indexes import DistanceStrategy, HNSWQueryOptions DEFAULT_TABLE = "test_table" + str(uuid.uuid4()).replace("-", "_") DEFAULT_TABLE_SYNC = "test_table" + str(uuid.uuid4()).replace("-", "_") @@ -172,7 +172,7 @@ async def test_asimilarity_search(self, vs): results = await vs.asimilarity_search("foo", k=1) assert len(results) == 1 assert results == [Document(page_content="foo", id=ids[0])] - results = await vs.asimilarity_search("foo", k=1, filter="content = 'bar'") + results = await vs.asimilarity_search("foo", k=1, filter={"content": "bar"}) assert results == [Document(page_content="bar", id=ids[1])] async def test_asimilarity_search_score(self, vs): @@ -231,7 +231,7 @@ async def test_amax_marginal_relevance_search(self, vs): results = await vs.amax_marginal_relevance_search("bar") assert results[0] == Document(page_content="bar", id=ids[1]) results = await vs.amax_marginal_relevance_search( - "bar", filter="content = 'boo'" + "bar", filter={"content": "boo"} ) assert results[0] == Document(page_content="boo", id=ids[3]) @@ -331,7 +331,7 @@ def test_similarity_search(self, vs_custom): results = vs_custom.similarity_search("foo", k=1) assert len(results) == 1 assert results == [Document(page_content="foo", id=ids[0])] - 
results = vs_custom.similarity_search("foo", k=1, filter="mycontent = 'bar'") + results = vs_custom.similarity_search("foo", k=1, filter={"mycontent": "bar"}) assert results == [Document(page_content="bar", id=ids[1])] def test_similarity_search_score(self, vs_custom): @@ -353,7 +353,7 @@ def test_max_marginal_relevance_search(self, vs_custom): results = vs_custom.max_marginal_relevance_search("bar") assert results[0] == Document(page_content="bar", id=ids[1]) results = vs_custom.max_marginal_relevance_search( - "bar", filter="mycontent = 'boo'" + "bar", filter={"mycontent": "boo"} ) assert results[0] == Document(page_content="boo", id=ids[3]) diff --git a/tests/test_vectorstore_from_methods.py b/tests/test_vectorstore_from_methods.py deleted file mode 100644 index 975620af..00000000 --- a/tests/test_vectorstore_from_methods.py +++ /dev/null @@ -1,333 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import uuid -from typing import Sequence - -import pytest -import pytest_asyncio -from langchain_core.documents import Document -from langchain_core.embeddings import DeterministicFakeEmbedding -from sqlalchemy import VARCHAR, text -from sqlalchemy.engine.row import RowMapping -from sqlalchemy.ext.asyncio import create_async_engine - -from langchain_google_alloydb_pg import AlloyDBEngine, AlloyDBVectorStore, Column - -DEFAULT_TABLE = "test_table" + str(uuid.uuid4()).replace("-", "_") -DEFAULT_TABLE_SYNC = "test_table_sync" + str(uuid.uuid4()).replace("-", "_") -CUSTOM_TABLE = "test_table_custom" + str(uuid.uuid4()).replace("-", "_") -CUSTOM_TABLE_WITH_INT_ID = "test_table_with_int_id" + str(uuid.uuid4()).replace( - "-", "_" -) -CUSTOM_TABLE_WITH_INT_ID_SYNC = "test_table_with_int_id" + str(uuid.uuid4()).replace( - "-", "_" -) -VECTOR_SIZE = 768 - - -embeddings_service = DeterministicFakeEmbedding(size=VECTOR_SIZE) - -texts = ["foo", "bar", "baz"] -metadatas = [{"page": str(i), "source": "google.com"} for i in range(len(texts))] -docs = [ - Document(page_content=texts[i], metadata=metadatas[i]) for i in range(len(texts)) -] - -embeddings = [embeddings_service.embed_query(texts[i]) for i in range(len(texts))] - - -def get_env_var(key: str, desc: str) -> str: - v = os.environ.get(key) - if v is None: - raise ValueError(f"Must set env var {key} to: {desc}") - return v - - -async def aexecute( - engine: AlloyDBEngine, - query: str, -) -> None: - async def run(engine, query): - async with engine._pool.connect() as conn: - await conn.execute(text(query)) - await conn.commit() - - await engine._run_as_async(run(engine, query)) - - -async def afetch(engine: AlloyDBEngine, query: str) -> Sequence[RowMapping]: - async def run(engine, query): - async with engine._pool.connect() as conn: - result = await conn.execute(text(query)) - result_map = result.mappings() - result_fetch = result_map.fetchall() - return result_fetch - - return await 
engine._run_as_async(run(engine, query)) - - -@pytest.mark.asyncio -class TestVectorStoreFromMethods: - @pytest.fixture(scope="module") - def db_project(self) -> str: - return get_env_var("PROJECT_ID", "project id for google cloud") - - @pytest.fixture(scope="module") - def db_region(self) -> str: - return get_env_var("REGION", "region for AlloyDB instance") - - @pytest.fixture(scope="module") - def db_cluster(self) -> str: - return get_env_var("CLUSTER_ID", "cluster for AlloyDB") - - @pytest.fixture(scope="module") - def db_instance(self) -> str: - return get_env_var("INSTANCE_ID", "instance for AlloyDB") - - @pytest.fixture(scope="module") - def db_name(self) -> str: - return get_env_var("DATABASE_ID", "database name on AlloyDB instance") - - @pytest_asyncio.fixture - async def engine(self, db_project, db_region, db_cluster, db_instance, db_name): - engine = await AlloyDBEngine.afrom_instance( - project_id=db_project, - cluster=db_cluster, - instance=db_instance, - region=db_region, - database=db_name, - ) - await engine.ainit_vectorstore_table(DEFAULT_TABLE, VECTOR_SIZE) - await engine.ainit_vectorstore_table( - CUSTOM_TABLE, - VECTOR_SIZE, - id_column="myid", - content_column="mycontent", - embedding_column="myembedding", - metadata_columns=[Column("page", "TEXT"), Column("source", "TEXT")], - store_metadata=False, - ) - await engine.ainit_vectorstore_table( - CUSTOM_TABLE_WITH_INT_ID, - VECTOR_SIZE, - id_column=Column(name="integer_id", data_type="INTEGER", nullable="False"), - content_column="mycontent", - embedding_column="myembedding", - metadata_columns=[Column("page", "TEXT"), Column("source", "TEXT")], - store_metadata=False, - ) - yield engine - await aexecute(engine, f"DROP TABLE IF EXISTS {DEFAULT_TABLE}") - await aexecute(engine, f"DROP TABLE IF EXISTS {CUSTOM_TABLE}") - await aexecute(engine, f"DROP TABLE IF EXISTS {CUSTOM_TABLE_WITH_INT_ID}") - await engine.close() - - @pytest_asyncio.fixture - async def engine_sync( - self, db_project, db_region, 
db_cluster, db_instance, db_name - ): - engine = AlloyDBEngine.from_instance( - project_id=db_project, - cluster=db_cluster, - instance=db_instance, - region=db_region, - database=db_name, - ) - engine.init_vectorstore_table(DEFAULT_TABLE_SYNC, VECTOR_SIZE) - engine.init_vectorstore_table( - CUSTOM_TABLE_WITH_INT_ID_SYNC, - VECTOR_SIZE, - id_column=Column(name="integer_id", data_type="INTEGER", nullable="False"), - content_column="mycontent", - embedding_column="myembedding", - metadata_columns=[Column("page", "TEXT"), Column("source", "TEXT")], - store_metadata=False, - ) - - yield engine - await aexecute(engine, f"DROP TABLE IF EXISTS {DEFAULT_TABLE_SYNC}") - await aexecute(engine, f"DROP TABLE IF EXISTS {CUSTOM_TABLE_WITH_INT_ID_SYNC}") - await engine.close() - - async def test_afrom_texts(self, engine): - ids = [str(uuid.uuid4()) for i in range(len(texts))] - await AlloyDBVectorStore.afrom_texts( - texts, - embeddings_service, - engine, - DEFAULT_TABLE, - metadatas=metadatas, - ids=ids, - ) - results = await afetch(engine, f"SELECT * FROM {DEFAULT_TABLE}") - assert len(results) == 3 - await aexecute(engine, f"TRUNCATE TABLE {DEFAULT_TABLE}") - - async def test_from_texts(self, engine_sync): - ids = [str(uuid.uuid4()) for i in range(len(texts))] - AlloyDBVectorStore.from_texts( - texts, - embeddings_service, - engine_sync, - DEFAULT_TABLE_SYNC, - metadatas=metadatas, - ids=ids, - ) - results = await afetch(engine_sync, f"SELECT * FROM {DEFAULT_TABLE_SYNC}") - assert len(results) == 3 - await aexecute(engine_sync, f"TRUNCATE TABLE {DEFAULT_TABLE_SYNC}") - - async def test_afrom_docs(self, engine): - ids = [str(uuid.uuid4()) for i in range(len(texts))] - await AlloyDBVectorStore.afrom_documents( - docs, - embeddings_service, - engine, - DEFAULT_TABLE, - ids=ids, - ) - results = await afetch(engine, f"SELECT * FROM {DEFAULT_TABLE}") - assert len(results) == 3 - await aexecute(engine, f"TRUNCATE TABLE {DEFAULT_TABLE}") - - async def test_from_docs(self, 
engine_sync): - ids = [str(uuid.uuid4()) for i in range(len(texts))] - AlloyDBVectorStore.from_documents( - docs, - embeddings_service, - engine_sync, - DEFAULT_TABLE_SYNC, - ids=ids, - ) - results = await afetch(engine_sync, f"SELECT * FROM {DEFAULT_TABLE_SYNC}") - assert len(results) == 3 - await aexecute(engine_sync, f"TRUNCATE TABLE {DEFAULT_TABLE_SYNC}") - - async def test_afrom_docs_cross_env(self, engine_sync): - ids = [str(uuid.uuid4()) for i in range(len(texts))] - await AlloyDBVectorStore.afrom_documents( - docs, - embeddings_service, - engine_sync, - DEFAULT_TABLE_SYNC, - ids=ids, - ) - results = await afetch(engine_sync, f"SELECT * FROM {DEFAULT_TABLE_SYNC}") - assert len(results) == 3 - await aexecute(engine_sync, f"TRUNCATE TABLE {DEFAULT_TABLE_SYNC}") - - async def test_from_docs_cross_env(self, engine, engine_sync): - ids = [str(uuid.uuid4()) for i in range(len(texts))] - AlloyDBVectorStore.from_documents( - docs, - embeddings_service, - engine, - DEFAULT_TABLE_SYNC, - ids=ids, - ) - results = await afetch(engine, f"SELECT * FROM {DEFAULT_TABLE_SYNC}") - assert len(results) == 3 - await aexecute(engine, f"TRUNCATE TABLE {DEFAULT_TABLE_SYNC}") - - async def test_afrom_texts_custom(self, engine): - ids = [str(uuid.uuid4()) for i in range(len(texts))] - await AlloyDBVectorStore.afrom_texts( - texts, - embeddings_service, - engine, - CUSTOM_TABLE, - ids=ids, - id_column="myid", - content_column="mycontent", - embedding_column="myembedding", - metadata_columns=["page", "source"], - ) - results = await afetch(engine, f"SELECT * FROM {CUSTOM_TABLE}") - assert len(results) == 3 - assert results[0]["mycontent"] == "foo" - assert results[0]["myembedding"] - assert results[0]["page"] is None - assert results[0]["source"] is None - - async def test_afrom_docs_custom(self, engine): - ids = [str(uuid.uuid4()) for i in range(len(texts))] - docs = [ - Document( - page_content=texts[i], - metadata={"page": str(i), "source": "google.com"}, - ) - for i in 
range(len(texts)) - ] - await AlloyDBVectorStore.afrom_documents( - docs, - embeddings_service, - engine, - CUSTOM_TABLE, - ids=ids, - id_column="myid", - content_column="mycontent", - embedding_column="myembedding", - metadata_columns=["page", "source"], - ) - - results = await afetch(engine, f"SELECT * FROM {CUSTOM_TABLE}") - assert len(results) == 3 - assert results[0]["mycontent"] == "foo" - assert results[0]["myembedding"] - assert results[0]["page"] == "0" - assert results[0]["source"] == "google.com" - await aexecute(engine, f"TRUNCATE TABLE {CUSTOM_TABLE}") - - async def test_afrom_texts_custom_with_int_id(self, engine): - ids = [i for i in range(len(texts))] - await AlloyDBVectorStore.afrom_texts( - texts, - embeddings_service, - engine, - CUSTOM_TABLE_WITH_INT_ID, - metadatas=metadatas, - ids=ids, - id_column="integer_id", - content_column="mycontent", - embedding_column="myembedding", - metadata_columns=["page", "source"], - ) - results = await afetch(engine, f"SELECT * FROM {CUSTOM_TABLE_WITH_INT_ID}") - assert len(results) == 3 - for row in results: - assert isinstance(row["integer_id"], int) - await aexecute(engine, f"TRUNCATE TABLE {CUSTOM_TABLE_WITH_INT_ID}") - - async def test_from_texts_custom_with_int_id(self, engine_sync): - ids = [i for i in range(len(texts))] - AlloyDBVectorStore.from_texts( - texts, - embeddings_service, - engine_sync, - CUSTOM_TABLE_WITH_INT_ID_SYNC, - ids=ids, - id_column="integer_id", - content_column="mycontent", - embedding_column="myembedding", - metadata_columns=["page", "source"], - ) - results = await afetch( - engine_sync, f"SELECT * FROM {CUSTOM_TABLE_WITH_INT_ID_SYNC}" - ) - assert len(results) == 3 - for row in results: - assert isinstance(row["integer_id"], int) - await aexecute(engine_sync, f"TRUNCATE TABLE {CUSTOM_TABLE_WITH_INT_ID_SYNC}") diff --git a/tests/test_vectorstore_index.py b/tests/test_vectorstore_index.py index c63c464f..5af2085b 100644 --- a/tests/test_vectorstore_index.py +++ 
b/tests/test_vectorstore_index.py @@ -14,7 +14,6 @@ import os -import sys import uuid import pytest @@ -22,25 +21,25 @@ import sqlalchemy from langchain_core.documents import Document from langchain_core.embeddings import DeterministicFakeEmbedding +from langchain_postgres.v2.indexes import DEFAULT_INDEX_NAME_SUFFIX from sqlalchemy import text from langchain_google_alloydb_pg import AlloyDBEngine, AlloyDBVectorStore from langchain_google_alloydb_pg.indexes import ( - DEFAULT_INDEX_NAME_SUFFIX, DistanceStrategy, - HNSWIndex, - IVFFlatIndex, IVFIndex, ScaNNIndex, ) -DEFAULT_TABLE = "test_table" + str(uuid.uuid4()).replace("-", "_") -DEFAULT_TABLE_ASYNC = "test_table" + str(uuid.uuid4()).replace("-", "_") -DEFAULT_TABLE_OMNI = "test_table" + str(uuid.uuid4()).replace("-", "_") -CUSTOM_TABLE = "test_table_custom" + str(uuid.uuid4()).replace("-", "_") -DEFAULT_INDEX_NAME = DEFAULT_TABLE + DEFAULT_INDEX_NAME_SUFFIX -DEFAULT_INDEX_NAME_ASYNC = DEFAULT_TABLE_ASYNC + DEFAULT_INDEX_NAME_SUFFIX -DEFAULT_INDEX_NAME_OMNI = DEFAULT_TABLE_OMNI + DEFAULT_INDEX_NAME_SUFFIX +DEFAULT_TABLE_UUID = str(uuid.uuid4()).replace("-", "_") +DEFAULT_TABLE_ASYNC_UUID = str(uuid.uuid4()).replace("-", "_") +OMNI_UUID = str(uuid.uuid4()).replace("-", "_") +DEFAULT_TABLE = "test_table" + DEFAULT_TABLE_UUID +DEFAULT_TABLE_ASYNC = "test_table" + DEFAULT_TABLE_ASYNC_UUID +DEFAULT_TABLE_OMNI = "test_table" + OMNI_UUID +DEFAULT_INDEX_NAME = DEFAULT_INDEX_NAME_SUFFIX + DEFAULT_TABLE_UUID +DEFAULT_INDEX_NAME_ASYNC = DEFAULT_INDEX_NAME_SUFFIX + DEFAULT_TABLE_ASYNC_UUID +DEFAULT_INDEX_NAME_OMNI = DEFAULT_INDEX_NAME_SUFFIX + OMNI_UUID VECTOR_SIZE = 768 embeddings_service = DeterministicFakeEmbedding(size=VECTOR_SIZE) @@ -122,42 +121,20 @@ async def vs(self, engine): vs.drop_vector_index() yield vs - async def test_aapply_vector_index(self, vs): - index = HNSWIndex() - vs.apply_vector_index(index) - assert vs.is_valid_index(DEFAULT_INDEX_NAME) - vs.drop_vector_index() - - async def test_areindex(self, vs): - 
if not vs.is_valid_index(DEFAULT_INDEX_NAME): - index = HNSWIndex() - vs.apply_vector_index(index) - vs.reindex() - vs.reindex(DEFAULT_INDEX_NAME) - assert vs.is_valid_index(DEFAULT_INDEX_NAME) - vs.drop_vector_index(DEFAULT_INDEX_NAME) - - async def test_dropindex(self, vs): - vs.drop_vector_index() - result = vs.is_valid_index(DEFAULT_INDEX_NAME) - assert not result - - async def test_aapply_vector_index_ivfflat(self, vs): - index = IVFFlatIndex(distance_strategy=DistanceStrategy.EUCLIDEAN) + async def test_aapply_vector_index_ivf(self, vs): + index = IVFIndex( + name=DEFAULT_INDEX_NAME, distance_strategy=DistanceStrategy.EUCLIDEAN + ) vs.apply_vector_index(index, concurrently=True) assert vs.is_valid_index(DEFAULT_INDEX_NAME) - index = IVFFlatIndex( + index = IVFIndex( name="secondindex", distance_strategy=DistanceStrategy.INNER_PRODUCT, ) vs.apply_vector_index(index) assert vs.is_valid_index("secondindex") vs.drop_vector_index("secondindex") - vs.drop_vector_index() - - async def test_is_valid_index(self, vs): - is_valid = vs.is_valid_index("invalid_index") - assert is_valid == False + vs.drop_vector_index(DEFAULT_INDEX_NAME) @pytest.mark.asyncio(loop_scope="class") @@ -247,44 +224,10 @@ async def omni_vs(self, omni_engine): ) yield vs - async def test_aapply_vector_index(self, vs): - index = HNSWIndex() - await vs.aapply_vector_index(index) - assert await vs.ais_valid_index(DEFAULT_INDEX_NAME_ASYNC) - - async def test_areindex(self, vs): - if not await vs.ais_valid_index(DEFAULT_INDEX_NAME_ASYNC): - index = HNSWIndex() - await vs.aapply_vector_index(index) - await vs.areindex() - await vs.areindex(DEFAULT_INDEX_NAME_ASYNC) - assert await vs.ais_valid_index(DEFAULT_INDEX_NAME_ASYNC) - await vs.adrop_vector_index(DEFAULT_INDEX_NAME_ASYNC) - - async def test_dropindex(self, vs): - await vs.adrop_vector_index() - result = await vs.ais_valid_index(DEFAULT_INDEX_NAME_ASYNC) - assert not result - - async def test_aapply_vector_index_ivfflat(self, vs): - index = 
IVFFlatIndex(distance_strategy=DistanceStrategy.EUCLIDEAN) - await vs.aapply_vector_index(index, concurrently=True) - assert await vs.ais_valid_index(DEFAULT_INDEX_NAME_ASYNC) - index = IVFFlatIndex( - name="secondindex", - distance_strategy=DistanceStrategy.INNER_PRODUCT, - ) - await vs.aapply_vector_index(index) - assert await vs.ais_valid_index("secondindex") - await vs.adrop_vector_index("secondindex") - await vs.adrop_vector_index() - - async def test_is_valid_index(self, vs): - is_valid = await vs.ais_valid_index("invalid_index") - assert is_valid == False - async def test_aapply_vector_index_ivf(self, vs): - index = IVFIndex(distance_strategy=DistanceStrategy.EUCLIDEAN) + index = IVFIndex( + name=DEFAULT_INDEX_NAME_ASYNC, distance_strategy=DistanceStrategy.EUCLIDEAN + ) await vs.aapply_vector_index(index, concurrently=True) assert await vs.ais_valid_index(DEFAULT_INDEX_NAME_ASYNC) index = IVFIndex( @@ -294,10 +237,12 @@ async def test_aapply_vector_index_ivf(self, vs): await vs.aapply_vector_index(index) assert await vs.ais_valid_index("secondindex") await vs.adrop_vector_index("secondindex") - await vs.adrop_vector_index() + await vs.adrop_vector_index(DEFAULT_INDEX_NAME_ASYNC) async def test_aapply_alloydb_scann_index_ScaNN(self, omni_vs): - index = ScaNNIndex(distance_strategy=DistanceStrategy.EUCLIDEAN) + index = ScaNNIndex( + name=DEFAULT_INDEX_NAME_OMNI, distance_strategy=DistanceStrategy.EUCLIDEAN + ) await omni_vs.aset_maintenance_work_mem(index.num_leaves, VECTOR_SIZE) await omni_vs.aapply_vector_index(index, concurrently=True) assert await omni_vs.ais_valid_index(DEFAULT_INDEX_NAME_OMNI) @@ -307,4 +252,4 @@ async def test_aapply_alloydb_scann_index_ScaNN(self, omni_vs): await omni_vs.aapply_vector_index(index) assert await omni_vs.ais_valid_index("secondindex") await omni_vs.adrop_vector_index("secondindex") - await omni_vs.adrop_vector_index() + await omni_vs.adrop_vector_index(DEFAULT_INDEX_NAME_OMNI) diff --git a/tests/test_vectorstore_search.py 
b/tests/test_vectorstore_search.py index 721f1e95..4eec7635 100644 --- a/tests/test_vectorstore_search.py +++ b/tests/test_vectorstore_search.py @@ -19,22 +19,14 @@ import pytest_asyncio from langchain_core.documents import Document from langchain_core.embeddings import DeterministicFakeEmbedding -from metadata_filtering_data import FILTERING_TEST_CASES, METADATAS, NEGATIVE_TEST_CASES +from langchain_postgres.v2.indexes import DistanceStrategy, HNSWQueryOptions from PIL import Image -from sqlalchemy import RowMapping, Sequence, text +from sqlalchemy import text -from langchain_google_alloydb_pg import AlloyDBEngine, AlloyDBVectorStore, Column -from langchain_google_alloydb_pg.indexes import DistanceStrategy, HNSWQueryOptions +from langchain_google_alloydb_pg import AlloyDBEngine, AlloyDBVectorStore -DEFAULT_TABLE = "test_table" + str(uuid.uuid4()).replace("-", "_") -DEFAULT_TABLE_SYNC = "test_table" + str(uuid.uuid4()).replace("-", "_") -CUSTOM_TABLE = "test_table_custom" + str(uuid.uuid4()).replace("-", "_") IMAGE_TABLE = "test_image_table" + str(uuid.uuid4()).replace("-", "_") IMAGE_TABLE_SYNC = "test_image_table_sync" + str(uuid.uuid4()).replace("-", "_") -CUSTOM_FILTER_TABLE = "test_table_custom_filter" + str(uuid.uuid4()).replace("-", "_") -CUSTOM_FILTER_TABLE_SYNC = "test_table_custom_filter_sync" + str(uuid.uuid4()).replace( - "-", "_" -) VECTOR_SIZE = 768 embeddings_service = DeterministicFakeEmbedding(size=VECTOR_SIZE) @@ -49,9 +41,6 @@ docs = [ Document(page_content=texts[i], metadata=metadatas[i]) for i in range(len(texts)) ] -filter_docs = [ - Document(page_content=texts[i], metadata=METADATAS[i]) for i in range(len(texts)) -] embeddings = [embeddings_service.embed_query("foo") for i in range(len(texts))] @@ -116,103 +105,9 @@ async def engine(self, db_project, db_region, db_cluster, db_instance, db_name): database=db_name, ) yield engine - await aexecute(engine, f"DROP TABLE IF EXISTS {DEFAULT_TABLE}") - await aexecute(engine, f"DROP TABLE IF EXISTS 
{CUSTOM_FILTER_TABLE}") - await engine.close() - - @pytest_asyncio.fixture(scope="class") - async def vs(self, engine): - await engine.ainit_vectorstore_table( - DEFAULT_TABLE, VECTOR_SIZE, store_metadata=False - ) - vs = await AlloyDBVectorStore.create( - engine, - embedding_service=embeddings_service, - table_name=DEFAULT_TABLE, - ) - - await vs.aadd_documents(docs, ids=ids) - yield vs - - @pytest_asyncio.fixture(scope="class") - async def engine_sync( - self, db_project, db_region, db_cluster, db_instance, db_name - ): - engine = AlloyDBEngine.from_instance( - project_id=db_project, - cluster=db_cluster, - instance=db_instance, - region=db_region, - database=db_name, - ) - yield engine - await aexecute(engine, f"DROP TABLE IF EXISTS {CUSTOM_TABLE}") + await aexecute(engine, f"DROP TABLE IF EXISTS {IMAGE_TABLE}") await engine.close() - @pytest_asyncio.fixture(scope="class") - async def vs_custom(self, engine_sync): - engine_sync.init_vectorstore_table( - CUSTOM_TABLE, - VECTOR_SIZE, - id_column="myid", - content_column="mycontent", - embedding_column="myembedding", - metadata_columns=[ - Column("page", "TEXT"), - Column("source", "TEXT"), - ], - store_metadata=False, - ) - - vs_custom = AlloyDBVectorStore.create_sync( - engine_sync, - embedding_service=embeddings_service, - table_name=CUSTOM_TABLE, - id_column="myid", - content_column="mycontent", - embedding_column="myembedding", - index_query_options=HNSWQueryOptions(ef_search=1), - ) - vs_custom.add_documents(docs, ids=ids) - yield vs_custom - - @pytest_asyncio.fixture(scope="class") - async def vs_custom_filter(self, engine): - await engine.ainit_vectorstore_table( - CUSTOM_FILTER_TABLE, - VECTOR_SIZE, - metadata_columns=[ - Column("name", "TEXT"), - Column("code", "TEXT"), - Column("price", "FLOAT"), - Column("is_available", "BOOLEAN"), - Column("tags", "TEXT[]"), - Column("inventory_location", "INTEGER[]"), - Column("available_quantity", "INTEGER", nullable=True), - ], - id_column="langchain_id", - 
store_metadata=False, - overwrite_existing=True, - ) - - vs_custom_filter = await AlloyDBVectorStore.create( - engine, - embedding_service=embeddings_service, - table_name=CUSTOM_FILTER_TABLE, - metadata_columns=[ - "name", - "code", - "price", - "is_available", - "tags", - "inventory_location", - "available_quantity", - ], - id_column="langchain_id", - ) - await vs_custom_filter.aadd_documents(filter_docs, ids=ids) - yield vs_custom_filter - @pytest_asyncio.fixture(scope="class") async def image_uris(self): red_uri = str(uuid.uuid4()).replace("-", "_") + "test_image_red.jpg" @@ -242,17 +137,9 @@ async def image_vs(self, engine, image_uris): table_name=IMAGE_TABLE, distance_strategy=DistanceStrategy.COSINE_DISTANCE, ) - ids = [str(uuid.uuid4()) for i in range(len(image_uris))] await vs.aadd_images(image_uris, ids=ids) yield vs - async def test_asimilarity_search(self, vs): - results = await vs.asimilarity_search("foo", k=1) - assert len(results) == 1 - assert results == [Document(page_content="foo", id=ids[0])] - results = await vs.asimilarity_search("foo", k=1, filter="content = 'bar'") - assert results == [Document(page_content="bar", id=ids[1])] - async def test_asimilarity_search_image(self, image_vs, image_uris): results = await image_vs.asimilarity_search_image(image_uris[0], k=1) assert len(results) == 1 @@ -261,110 +148,6 @@ async def test_asimilarity_search_image(self, image_vs, image_uris): assert len(results) == 1 assert results[0].metadata["image_uri"] == image_uris[3] - async def test_asimilarity_search_score(self, vs): - results = await vs.asimilarity_search_with_score("foo") - assert len(results) == 4 - assert results[0][0] == Document(page_content="foo", id=ids[0]) - assert results[0][1] == 0 - - async def test_asimilarity_search_by_vector(self, vs): - embedding = embeddings_service.embed_query("foo") - results = await vs.asimilarity_search_by_vector(embedding) - assert len(results) == 4 - assert results[0] == Document(page_content="foo", id=ids[0]) 
- results = await vs.asimilarity_search_with_score_by_vector(embedding) - assert results[0][0] == Document(page_content="foo", id=ids[0]) - assert results[0][1] == 0 - - async def test_similarity_search_with_relevance_scores_threshold_cosine(self, vs): - score_threshold = {"score_threshold": 0} - results = await vs.asimilarity_search_with_relevance_scores( - "foo", **score_threshold - ) - # Note: Since tests use FakeEmbeddings which are non-normalized vectors, results might have scores beyond the range [0,1]. - # For a normalized embedding service, a threshold of zero will yield all matched documents. - assert len(results) == 2 - - score_threshold = {"score_threshold": 0.02} - results = await vs.asimilarity_search_with_relevance_scores( - "foo", **score_threshold - ) - assert len(results) == 2 - - score_threshold = {"score_threshold": 0.9} - results = await vs.asimilarity_search_with_relevance_scores( - "foo", **score_threshold - ) - assert len(results) == 1 - assert results[0][0] == Document(page_content="foo", id=ids[0]) - - async def test_similarity_search_with_relevance_scores_threshold_euclidean( - self, engine - ): - vs = await AlloyDBVectorStore.create( - engine, - embedding_service=embeddings_service, - table_name=DEFAULT_TABLE, - distance_strategy=DistanceStrategy.EUCLIDEAN, - ) - - score_threshold = {"score_threshold": 0.9} - results = await vs.asimilarity_search_with_relevance_scores( - "foo", **score_threshold - ) - assert len(results) == 1 - assert results[0][0] == Document(page_content="foo", id=ids[0]) - - async def test_amax_marginal_relevance_search(self, vs): - results = await vs.amax_marginal_relevance_search("bar") - assert results[0] == Document(page_content="bar", id=ids[1]) - results = await vs.amax_marginal_relevance_search( - "bar", filter="content = 'boo'" - ) - assert results[0] == Document(page_content="boo", id=ids[3]) - - async def test_amax_marginal_relevance_search_vector(self, vs): - embedding = embeddings_service.embed_query("bar") 
- results = await vs.amax_marginal_relevance_search_by_vector(embedding) - assert results[0] == Document(page_content="bar", id=ids[1]) - - async def test_amax_marginal_relevance_search_vector_score(self, vs): - embedding = embeddings_service.embed_query("bar") - results = await vs.amax_marginal_relevance_search_with_score_by_vector( - embedding - ) - assert results[0][0] == Document(page_content="bar", id=ids[1]) - - results = await vs.amax_marginal_relevance_search_with_score_by_vector( - embedding, lambda_mult=0.75, fetch_k=10 - ) - assert results[0][0] == Document(page_content="bar", id=ids[1]) - - async def test_aget_by_ids(self, vs): - test_ids = [ids[0]] - results = await vs.aget_by_ids(ids=test_ids) - - assert results[0] == Document(page_content="foo", id=ids[0]) - - async def test_aget_by_ids_custom_vs(self, vs_custom): - test_ids = [ids[0]] - results = await vs_custom.aget_by_ids(ids=test_ids) - - assert results[0] == Document(page_content="foo", id=ids[0]) - - @pytest.mark.parametrize("test_filter, expected_ids", FILTERING_TEST_CASES) - async def test_vectorstore_with_metadata_filters( - self, - vs_custom_filter, - test_filter, - expected_ids, - ): - """Test end to end construction and search.""" - docs = await vs_custom_filter.asimilarity_search( - "meow", k=5, filter=test_filter - ) - assert [doc.metadata["code"] for doc in docs] == expected_ids, test_filter - class TestVectorStoreSearchSync: @pytest.fixture(scope="module") @@ -399,75 +182,9 @@ async def engine_sync( database=db_name, ) yield engine - await aexecute(engine, f"DROP TABLE IF EXISTS {DEFAULT_TABLE_SYNC}") - await aexecute(engine, f"DROP TABLE IF EXISTS {CUSTOM_FILTER_TABLE_SYNC}") + await aexecute(engine, f"DROP TABLE IF EXISTS {IMAGE_TABLE_SYNC}") await engine.close() - @pytest_asyncio.fixture(scope="class") - async def vs_custom(self, engine_sync): - engine_sync.init_vectorstore_table( - DEFAULT_TABLE_SYNC, - VECTOR_SIZE, - id_column="myid", - content_column="mycontent", - 
embedding_column="myembedding", - metadata_columns=[ - Column("page", "TEXT"), - Column("source", "TEXT"), - ], - store_metadata=False, - ) - - vs_custom = await AlloyDBVectorStore.create( - engine_sync, - embedding_service=embeddings_service, - table_name=DEFAULT_TABLE_SYNC, - id_column="myid", - content_column="mycontent", - embedding_column="myembedding", - index_query_options=HNSWQueryOptions(ef_search=1), - ) - vs_custom.add_documents(docs, ids=ids) - yield vs_custom - - @pytest_asyncio.fixture(scope="class") - async def vs_custom_filter_sync(self, engine_sync): - engine_sync.init_vectorstore_table( - CUSTOM_FILTER_TABLE_SYNC, - VECTOR_SIZE, - metadata_columns=[ - Column("name", "TEXT"), - Column("code", "TEXT"), - Column("price", "FLOAT"), - Column("is_available", "BOOLEAN"), - Column("tags", "TEXT[]"), - Column("inventory_location", "INTEGER[]"), - Column("available_quantity", "INTEGER", nullable=True), - ], - id_column="langchain_id", - store_metadata=False, - overwrite_existing=True, - ) - - vs_custom_filter_sync = await AlloyDBVectorStore.create( - engine_sync, - embedding_service=embeddings_service, - table_name=CUSTOM_FILTER_TABLE_SYNC, - metadata_columns=[ - "name", - "code", - "price", - "is_available", - "tags", - "inventory_location", - "available_quantity", - ], - id_column="langchain_id", - ) - - vs_custom_filter_sync.add_documents(filter_docs, ids=ids) - yield vs_custom_filter_sync - @pytest_asyncio.fixture(scope="class") async def image_uris(self): red_uri = str(uuid.uuid4()).replace("-", "_") + "test_image_red.jpg" @@ -493,83 +210,10 @@ def image_vs(self, engine_sync, image_uris): table_name=IMAGE_TABLE_SYNC, distance_strategy=DistanceStrategy.COSINE_DISTANCE, ) - ids = [str(uuid.uuid4()) for i in range(len(image_uris))] vs.add_images(image_uris, ids=ids) yield vs - def test_similarity_search(self, vs_custom): - results = vs_custom.similarity_search("foo", k=1) - assert len(results) == 1 - assert results == [Document(page_content="foo", 
id=ids[0])] - results = vs_custom.similarity_search("foo", k=1, filter="mycontent = 'bar'") - assert results == [Document(page_content="bar", id=ids[1])] - def test_similarity_search_image(self, image_vs, image_uris): results = image_vs.similarity_search_image(image_uris[0], k=1) assert len(results) == 1 assert results[0].metadata["image_uri"] == image_uris[0] - - def test_similarity_search_score(self, vs_custom): - results = vs_custom.similarity_search_with_score("foo") - assert len(results) == 4 - assert results[0][0] == Document(page_content="foo", id=ids[0]) - assert results[0][1] == 0 - - def test_similarity_search_by_vector(self, vs_custom): - embedding = embeddings_service.embed_query("foo") - results = vs_custom.similarity_search_by_vector(embedding) - assert len(results) == 4 - assert results[0] == Document(page_content="foo", id=ids[0]) - results = vs_custom.similarity_search_with_score_by_vector(embedding) - assert results[0][0] == Document(page_content="foo", id=ids[0]) - assert results[0][1] == 0 - - def test_max_marginal_relevance_search(self, vs_custom): - results = vs_custom.max_marginal_relevance_search("bar") - assert results[0] == Document(page_content="bar", id=ids[1]) - results = vs_custom.max_marginal_relevance_search( - "bar", filter="mycontent = 'boo'" - ) - assert results[0] == Document(page_content="boo", id=ids[3]) - - def test_max_marginal_relevance_search_vector(self, vs_custom): - embedding = embeddings_service.embed_query("bar") - results = vs_custom.max_marginal_relevance_search_by_vector(embedding) - assert results[0] == Document(page_content="bar", id=ids[1]) - - def test_max_marginal_relevance_search_vector_score(self, vs_custom): - embedding = embeddings_service.embed_query("bar") - results = vs_custom.max_marginal_relevance_search_with_score_by_vector( - embedding - ) - assert results[0][0] == Document(page_content="bar", id=ids[1]) - - results = vs_custom.max_marginal_relevance_search_with_score_by_vector( - embedding, 
lambda_mult=0.75, fetch_k=10 - ) - assert results[0][0] == Document(page_content="bar", id=ids[1]) - - def test_get_by_ids_custom_vs(self, vs_custom): - test_ids = [ids[0]] - results = vs_custom.get_by_ids(ids=test_ids) - - assert results[0] == Document(page_content="foo", id=ids[0]) - - @pytest.mark.parametrize("test_filter, expected_ids", FILTERING_TEST_CASES) - def test_sync_vectorstore_with_metadata_filters( - self, - vs_custom_filter_sync, - test_filter, - expected_ids, - ): - """Test end to end construction and search.""" - - docs = vs_custom_filter_sync.similarity_search("meow", k=5, filter=test_filter) - assert [doc.metadata["code"] for doc in docs] == expected_ids, test_filter - - @pytest.mark.parametrize("test_filter", NEGATIVE_TEST_CASES) - def test_metadata_filter_negative_tests(self, vs_custom_filter_sync, test_filter): - with pytest.raises((ValueError, NotImplementedError)): - docs = vs_custom_filter_sync.similarity_search( - "meow", k=5, filter=test_filter - )