From 24d121ed88ccd4aa22a01217e8bcb815d02a652d Mon Sep 17 00:00:00 2001 From: vbarda Date: Sun, 14 Jul 2024 18:47:43 -0400 Subject: [PATCH 1/4] add docs from langgraph --- backend/ingest.py | 46 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/backend/ingest.py b/backend/ingest.py index b5491b244..7eeb46a24 100644 --- a/backend/ingest.py +++ b/backend/ingest.py @@ -2,6 +2,7 @@ import logging import os import re +from typing import Optional import weaviate from bs4 import BeautifulSoup, SoupStrainer @@ -24,15 +25,19 @@ def get_embeddings_model() -> Embeddings: return OpenAIEmbeddings(model="text-embedding-3-small", chunk_size=200) -def metadata_extractor(meta: dict, soup: BeautifulSoup) -> dict: - title = soup.find("title") - description = soup.find("meta", attrs={"name": "description"}) - html = soup.find("html") +def metadata_extractor(meta: dict, soup: BeautifulSoup, title_suffix: Optional[str] = None) -> dict: + title_element = soup.find("title") + description_element = soup.find("meta", attrs={"name": "description"}) + html_element = soup.find("html") + title = title_element.get_text() if title_element else "" + if title_suffix is not None: + title += title_suffix + return { "source": meta["loc"], - "title": title.get_text() if title else "", - "description": description.get("content", "") if description else "", - "language": html.get("lang", "") if html else "", + "title": title, + "description": description_element.get("content", "") if description_element else "", + "language": html_element.get("lang", "") if html_element else "", **meta, } @@ -48,7 +53,19 @@ def load_langchain_docs(): name=("article", "title", "html", "lang", "content") ), }, - meta_function=metadata_extractor, + meta_function=metadata_extractor + ).load() + + +def load_langgraph_docs(): + return SitemapLoader( + "https://langchain-ai.github.io/langgraph/sitemap.xml", + parsing_function=simple_extractor, + default_parser="lxml", + bs_kwargs={ + "parse_only": SoupStrainer(name=("article", "title")) + }, + meta_function=lambda meta, soup: metadata_extractor(meta, soup, title_suffix=" | 🦜🕸️LangGraph"), ).load() @@ -69,8 +86,13 @@ def load_langsmith_docs(): ).load() -def simple_extractor(html: str) -> str: - soup = BeautifulSoup(html, "lxml") +def simple_extractor(html: str | BeautifulSoup) -> str: + if isinstance(html, str): + soup = BeautifulSoup(html, "lxml") + elif isinstance(html, BeautifulSoup): + soup = html + else: + raise ValueError("Input should be either BeautifulSoup object or an HTML string") return re.sub(r"\n\n+", "\n\n", soup.text).strip() @@ -126,7 +148,9 @@ def ingest_docs(): docs_from_api = load_api_docs() logger.info(f"Loaded {len(docs_from_api)} docs from API") docs_from_langsmith = load_langsmith_docs() - logger.info(f"Loaded {len(docs_from_langsmith)} docs from Langsmith") + logger.info(f"Loaded {len(docs_from_langsmith)} docs from LangSmith") + docs_from_langgraph = load_langgraph_docs() + logger.info(f"Loaded {len(docs_from_langgraph)} docs from LangGraph") docs_transformed = text_splitter.split_documents( docs_from_documentation + docs_from_api + docs_from_langsmith From 0a9bf02d6db5d2df000263e042036ef7a5b73fb4 Mon Sep 17 00:00:00 2001 From: vbarda Date: Mon, 15 Jul 2024 10:32:57 -0400 Subject: [PATCH 2/4] lint --- backend/ingest.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/backend/ingest.py b/backend/ingest.py index 7eeb46a24..dd285e315 100644 --- a/backend/ingest.py +++ b/backend/ingest.py @@ -25,7 +25,9 @@ def get_embeddings_model() -> Embeddings: return OpenAIEmbeddings(model="text-embedding-3-small", chunk_size=200) -def metadata_extractor(meta: dict, soup: BeautifulSoup, title_suffix: Optional[str] = None) -> dict: +def metadata_extractor( + meta: dict, soup: BeautifulSoup, title_suffix: Optional[str] = None +) -> dict: title_element = soup.find("title") description_element = soup.find("meta", attrs={"name": "description"}) html_element = soup.find("html") @@ -36,7 +38,9 @@ def metadata_extractor(meta: dict, soup: BeautifulSoup, title_suffix: Optional[s return { "source": meta["loc"], "title": title, - "description": description_element.get("content", "") if description_element else "", + "description": description_element.get("content", "") + if description_element + else "", "language": html_element.get("lang", "") if html_element else "", **meta, } @@ -53,7 +57,7 @@ def load_langchain_docs(): name=("article", "title", "html", "lang", "content") ), }, - meta_function=metadata_extractor + meta_function=metadata_extractor, ).load() @@ -62,10 +66,10 @@ def load_langgraph_docs(): "https://langchain-ai.github.io/langgraph/sitemap.xml", parsing_function=simple_extractor, default_parser="lxml", - bs_kwargs={ - "parse_only": SoupStrainer(name=("article", "title")) - }, - meta_function=lambda meta, soup: metadata_extractor(meta, soup, title_suffix=" | 🦜🕸️LangGraph"), + bs_kwargs={"parse_only": SoupStrainer(name=("article", "title"))}, + meta_function=lambda meta, soup: metadata_extractor( + meta, soup, title_suffix=" | 🦜🕸️LangGraph" + ), ).load() @@ -92,7 +96,9 @@ def simple_extractor(html: str | BeautifulSoup) -> str: elif isinstance(html, BeautifulSoup): soup = html else: - raise ValueError("Input should be either BeautifulSoup object or an HTML string") + raise ValueError( + "Input should be either BeautifulSoup object or an HTML string" + ) return re.sub(r"\n\n+", "\n\n", soup.text).strip() From 49aef50805182e9e202d94f124283674a9377c63 Mon Sep 17 00:00:00 2001 From: vbarda Date: Mon, 15 Jul 2024 21:00:40 -0400 Subject: [PATCH 3/4] lol --- backend/ingest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/ingest.py b/backend/ingest.py index 488e47693..62358cee9 100644 --- a/backend/ingest.py +++ b/backend/ingest.py @@ -159,7 +159,7 @@ def ingest_docs(): logger.info(f"Loaded {len(docs_from_langgraph)} docs from LangGraph") docs_transformed = text_splitter.split_documents( - docs_from_documentation + docs_from_api + docs_from_langsmith + docs_from_documentation + docs_from_api + docs_from_langsmith + docs_from_langgraph ) docs_transformed = [doc for doc in docs_transformed if len(doc.page_content) > 10] From 2fad584d82fd016811ff04a545c3c6860bb4a609 Mon Sep 17 00:00:00 2001 From: vbarda Date: Mon, 15 Jul 2024 21:06:18 -0400 Subject: [PATCH 4/4] lint --- backend/ingest.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/backend/ingest.py b/backend/ingest.py index 62358cee9..b79a34021 100644 --- a/backend/ingest.py +++ b/backend/ingest.py @@ -159,7 +159,10 @@ def ingest_docs(): logger.info(f"Loaded {len(docs_from_langgraph)} docs from LangGraph") docs_transformed = text_splitter.split_documents( - docs_from_documentation + docs_from_api + docs_from_langsmith + docs_from_langgraph + docs_from_documentation + + docs_from_api + + docs_from_langsmith + + docs_from_langgraph ) docs_transformed = [doc for doc in docs_transformed if len(doc.page_content) > 10]