From 3191352c9eab9857b3a4494df1c44dc08e30b0bf Mon Sep 17 00:00:00 2001 From: isaac hershenson Date: Mon, 5 Aug 2024 17:22:36 -0700 Subject: [PATCH 1/2] draft --- langchain_postgres/vectorstores.py | 190 +++++++++++++++++------------ 1 file changed, 111 insertions(+), 79 deletions(-) diff --git a/langchain_postgres/vectorstores.py b/langchain_postgres/vectorstores.py index 6afdd98b..2dfb4b1d 100644 --- a/langchain_postgres/vectorstores.py +++ b/langchain_postgres/vectorstores.py @@ -246,98 +246,130 @@ def _create_vector_extension(conn: Connection) -> None: class PGVector(VectorStore): - """Vectorstore implementation using Postgres as the backend. + """Postgres vector store integration. - Currently, there is no mechanism for supporting data migration. + Setup: + Install ``langchain_postgres`` and run the docker container. - So breaking changes in the vectorstore schema will require the user to recreate - the tables and re-add the documents. + .. code-block:: bash - If this is a concern, please use a different vectorstore. If - not, this implementation should be fine for your use case. + pip install -qU langchain-postgres + docker run --name pgvector-container -e POSTGRES_USER=langchain -e POSTGRES_PASSWORD=langchain -e POSTGRES_DB=langchain -p 6024:5432 -d pgvector/pgvector:pg16 - To use this vectorstore you need to have the `vector` extension installed. - The `vector` extension is a Postgres extension that provides vector - similarity search capabilities. + Key init args — indexing params: + collection_name: str + Name of the collection. + embeddings: Embeddings + Embedding function to use. - ```sh - docker run --name pgvector-container -e POSTGRES_PASSWORD=... - -d pgvector/pgvector:pg16 - ``` - - Example: + Key init args — client params: + connection: Union[None, DBConnection, Engine, AsyncEngine, str] + Connection string or engine. + + Instantiate: .. 
code-block:: python + from langchain_postgres import PGVector from langchain_postgres.vectorstores import PGVector - from langchain_openai.embeddings import OpenAIEmbeddings - - connection_string = "postgresql+psycopg://..." - collection_name = "state_of_the_union_test" - embeddings = OpenAIEmbeddings() - vectorstore = PGVector.from_documents( - embedding=embeddings, - documents=docs, - connection=connection_string, + from langchain_openai import OpenAIEmbeddings + + # See docker command above to launch a postgres instance with pgvector enabled. + connection = "postgresql+psycopg://langchain:langchain@localhost:6024/langchain" # Uses psycopg3! + collection_name = "my_docs" + + vector_store = PGVector( + embeddings=OpenAIEmbeddings(model="text-embedding-3-large"), collection_name=collection_name, + connection=connection, use_jsonb=True, - async_mode=False, ) + Add Documents: + .. code-block:: python + + from langchain_core.documents import Document + + document_1 = Document(page_content="foo", metadata={"baz": "bar"}) + document_2 = Document(page_content="thud", metadata={"bar": "baz"}) + document_3 = Document(page_content="i will be deleted :(") + + documents = [document_1, document_2, document_3] + ids = ["1", "2", "3"] + vector_store.add_documents(documents=documents, ids=ids) + + Delete Documents: + .. code-block:: python + + vector_store.delete(ids=["3"]) + + Search: + .. code-block:: python + + results = vector_store.similarity_search(query="thud",k=1) + for doc in results: + print(f"* {doc.page_content} [{doc.metadata}]") + + .. code-block:: python + + * thud [{'bar': 'baz'}] + + Search with filter: + .. code-block:: python + + results = vector_store.similarity_search(query="thud",k=1,filter={"bar": "baz"}) + for doc in results: + print(f"* {doc.page_content} [{doc.metadata}]") + + .. code-block:: python + + * thud [{'bar': 'baz'}] + + Search with score: + .. 
code-block:: python + + results = vector_store.similarity_search_with_score(query="qux",k=1) + for doc, score in results: + print(f"* [SIM={score:3f}] {doc.page_content} [{doc.metadata}]") + + .. code-block:: python + + * [SIM=0.499243] foo [{'baz': 'bar'}] + + Async: + .. code-block:: python + + # add documents + # await vector_store.aadd_documents(documents=documents, ids=ids) + + # delete documents + # await vector_store.adelete(ids=["3"]) + + # search + # results = await vector_store.asimilarity_search(query="thud",k=1) + + # search with score + results = await vector_store.asimilarity_search_with_score(query="qux",k=1) + for doc,score in results: + print(f"* [SIM={score:3f}] {doc.page_content} [{doc.metadata}]") + + .. code-block:: python + + * [SIM=0.499243] foo [{'baz': 'bar'}] + + Use as Retriever: + .. code-block:: python + + retriever = vector_store.as_retriever( + search_type="mmr", + search_kwargs={"k": 1, "fetch_k": 2, "lambda_mult": 0.5}, + ) + retriever.invoke("thud") + + .. code-block:: python + + [Document(metadata={'bar': 'baz'}, page_content='thud')] - This code has been ported over from langchain_community with minimal changes - to allow users to easily transition from langchain_community to langchain_postgres. - - Some changes had to be made to address issues with the community implementation: - * langchain_postgres now works with psycopg3. Please update your - connection strings from `postgresql+psycopg2://...` to - `postgresql+psycopg://langchain:langchain@...` - (yes, the driver name is `psycopg` not `psycopg3`) - * The schema of the embedding store and collection have been changed to make - add_documents work correctly with user specified ids, specifically - when overwriting existing documents. - You will need to recreate the tables if you are using an existing database. - * A Connection object has to be provided explicitly. Connections will not be - picked up automatically based on env variables. 
- * langchain_postgres now accept async connections. If you want to use the async - version, you need to set `async_mode=True` when initializing the store or - use an async engine. - - Supported filter operators: - - * $eq: Equality operator - * $ne: Not equal operator - * $lt: Less than operator - * $lte: Less than or equal operator - * $gt: Greater than operator - * $gte: Greater than or equal operator - * $in: In operator - * $nin: Not in operator - * $between: Between operator - * $exists: Exists operator - * $like: Like operator - * $ilike: Case insensitive like operator - * $and: Logical AND operator - * $or: Logical OR operator - * $not: Logical NOT operator - - Example: - - .. code-block:: python - - vectorstore.similarity_search('kitty', k=10, filter={ - 'id': {'$in': [1, 5, 2, 9]} - }) - #%% md - - If you provide a dict with multiple fields, but no operators, - the top level will be interpreted as a logical **AND** filter - - vectorstore.similarity_search('ducks', k=10, filter={ - 'id': {'$in': [1, 5, 2, 9]}, - 'location': {'$in': ["pond", "market"]} - }) - - """ + """ # noqa: E501 def __init__( self, From b4c51ddfe2676644b04b6d190af0ae5328227070 Mon Sep 17 00:00:00 2001 From: isaac hershenson Date: Tue, 6 Aug 2024 11:56:22 -0700 Subject: [PATCH 2/2] fmt --- langchain_postgres/vectorstores.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/langchain_postgres/vectorstores.py b/langchain_postgres/vectorstores.py index 2dfb4b1d..a05fe99a 100644 --- a/langchain_postgres/vectorstores.py +++ b/langchain_postgres/vectorstores.py @@ -265,7 +265,7 @@ class PGVector(VectorStore): Key init args — client params: connection: Union[None, DBConnection, Engine, AsyncEngine, str] Connection string or engine. - + Instantiate: .. code-block:: python