From 24d121ed88ccd4aa22a01217e8bcb815d02a652d Mon Sep 17 00:00:00 2001
From: vbarda <vadym@langchain.dev>
Date: Sun, 14 Jul 2024 18:47:43 -0400
Subject: [PATCH 1/4] add docs from langgraph

---
 backend/ingest.py | 46 +++++++++++++++++++++++++++++++++++-----------
 1 file changed, 35 insertions(+), 11 deletions(-)

diff --git a/backend/ingest.py b/backend/ingest.py
index b5491b244..7eeb46a24 100644
--- a/backend/ingest.py
+++ b/backend/ingest.py
@@ -2,6 +2,7 @@
 import logging
 import os
 import re
+from typing import Optional
 
 import weaviate
 from bs4 import BeautifulSoup, SoupStrainer
@@ -24,15 +25,19 @@ def get_embeddings_model() -> Embeddings:
     return OpenAIEmbeddings(model="text-embedding-3-small", chunk_size=200)
 
 
-def metadata_extractor(meta: dict, soup: BeautifulSoup) -> dict:
-    title = soup.find("title")
-    description = soup.find("meta", attrs={"name": "description"})
-    html = soup.find("html")
+def metadata_extractor(meta: dict, soup: BeautifulSoup, title_suffix: Optional[str] = None) -> dict:
+    title_element = soup.find("title")
+    description_element = soup.find("meta", attrs={"name": "description"})
+    html_element = soup.find("html")
+    title = title_element.get_text() if title_element else ""
+    if title_suffix is not None:
+        title += title_suffix
+
     return {
         "source": meta["loc"],
-        "title": title.get_text() if title else "",
-        "description": description.get("content", "") if description else "",
-        "language": html.get("lang", "") if html else "",
+        "title": title,
+        "description": description_element.get("content", "") if description_element else "",
+        "language": html_element.get("lang", "") if html_element else "",
         **meta,
     }
 
@@ -48,7 +53,19 @@ def load_langchain_docs():
                 name=("article", "title", "html", "lang", "content")
             ),
         },
-        meta_function=metadata_extractor,
+        meta_function=metadata_extractor
+    ).load()
+
+
+def load_langgraph_docs():
+    return SitemapLoader(
+        "https://langchain-ai.github.io/langgraph/sitemap.xml",
+        parsing_function=simple_extractor,
+        default_parser="lxml",
+        bs_kwargs={
+            "parse_only": SoupStrainer(name=("article", "title"))
+        },
+        meta_function=lambda meta, soup: metadata_extractor(meta, soup, title_suffix=" | 🦜🕸️LangGraph"),
     ).load()
 
 
@@ -69,8 +86,13 @@ def load_langsmith_docs():
     ).load()
 
 
-def simple_extractor(html: str) -> str:
-    soup = BeautifulSoup(html, "lxml")
+def simple_extractor(html: str | BeautifulSoup) -> str:
+    if isinstance(html, str):
+        soup = BeautifulSoup(html, "lxml")
+    elif isinstance(html, BeautifulSoup):
+        soup = html
+    else:
+        raise ValueError("Input should be either BeautifulSoup object or an HTML string")
     return re.sub(r"\n\n+", "\n\n", soup.text).strip()
 
 
@@ -126,7 +148,9 @@ def ingest_docs():
     docs_from_api = load_api_docs()
     logger.info(f"Loaded {len(docs_from_api)} docs from API")
     docs_from_langsmith = load_langsmith_docs()
-    logger.info(f"Loaded {len(docs_from_langsmith)} docs from Langsmith")
+    logger.info(f"Loaded {len(docs_from_langsmith)} docs from LangSmith")
+    docs_from_langgraph = load_langgraph_docs()
+    logger.info(f"Loaded {len(docs_from_langgraph)} docs from LangGraph")
 
     docs_transformed = text_splitter.split_documents(
         docs_from_documentation + docs_from_api + docs_from_langsmith

From 0a9bf02d6db5d2df000263e042036ef7a5b73fb4 Mon Sep 17 00:00:00 2001
From: vbarda <vadym@langchain.dev>
Date: Mon, 15 Jul 2024 10:32:57 -0400
Subject: [PATCH 2/4] lint

---
 backend/ingest.py | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/backend/ingest.py b/backend/ingest.py
index 7eeb46a24..dd285e315 100644
--- a/backend/ingest.py
+++ b/backend/ingest.py
@@ -25,7 +25,9 @@ def get_embeddings_model() -> Embeddings:
     return OpenAIEmbeddings(model="text-embedding-3-small", chunk_size=200)
 
 
-def metadata_extractor(meta: dict, soup: BeautifulSoup, title_suffix: Optional[str] = None) -> dict:
+def metadata_extractor(
+    meta: dict, soup: BeautifulSoup, title_suffix: Optional[str] = None
+) -> dict:
     title_element = soup.find("title")
     description_element = soup.find("meta", attrs={"name": "description"})
     html_element = soup.find("html")
@@ -36,7 +38,9 @@ def metadata_extractor(meta: dict, soup: BeautifulSoup, title_suffix: Optional[s
     return {
         "source": meta["loc"],
         "title": title,
-        "description": description_element.get("content", "") if description_element else "",
+        "description": description_element.get("content", "")
+        if description_element
+        else "",
         "language": html_element.get("lang", "") if html_element else "",
         **meta,
     }
@@ -53,7 +57,7 @@ def load_langchain_docs():
                 name=("article", "title", "html", "lang", "content")
             ),
         },
-        meta_function=metadata_extractor
+        meta_function=metadata_extractor,
     ).load()
 
 
@@ -62,10 +66,10 @@ def load_langgraph_docs():
         "https://langchain-ai.github.io/langgraph/sitemap.xml",
         parsing_function=simple_extractor,
         default_parser="lxml",
-        bs_kwargs={
-            "parse_only": SoupStrainer(name=("article", "title"))
-        },
-        meta_function=lambda meta, soup: metadata_extractor(meta, soup, title_suffix=" | 🦜🕸️LangGraph"),
+        bs_kwargs={"parse_only": SoupStrainer(name=("article", "title"))},
+        meta_function=lambda meta, soup: metadata_extractor(
+            meta, soup, title_suffix=" | 🦜🕸️LangGraph"
+        ),
     ).load()
 
 
@@ -92,7 +96,9 @@ def simple_extractor(html: str | BeautifulSoup) -> str:
     elif isinstance(html, BeautifulSoup):
         soup = html
     else:
-        raise ValueError("Input should be either BeautifulSoup object or an HTML string")
+        raise ValueError(
+            "Input should be either BeautifulSoup object or an HTML string"
+        )
     return re.sub(r"\n\n+", "\n\n", soup.text).strip()
 
 

From 49aef50805182e9e202d94f124283674a9377c63 Mon Sep 17 00:00:00 2001
From: vbarda <vadym@langchain.dev>
Date: Mon, 15 Jul 2024 21:00:40 -0400
Subject: [PATCH 3/4] include langgraph docs in the documents passed to the text splitter

---
 backend/ingest.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/ingest.py b/backend/ingest.py
index 488e47693..62358cee9 100644
--- a/backend/ingest.py
+++ b/backend/ingest.py
@@ -159,7 +159,7 @@ def ingest_docs():
     logger.info(f"Loaded {len(docs_from_langgraph)} docs from LangGraph")
 
     docs_transformed = text_splitter.split_documents(
-        docs_from_documentation + docs_from_api + docs_from_langsmith
+        docs_from_documentation + docs_from_api + docs_from_langsmith + docs_from_langgraph
     )
     docs_transformed = [doc for doc in docs_transformed if len(doc.page_content) > 10]
 

From 2fad584d82fd016811ff04a545c3c6860bb4a609 Mon Sep 17 00:00:00 2001
From: vbarda <vadym@langchain.dev>
Date: Mon, 15 Jul 2024 21:06:18 -0400
Subject: [PATCH 4/4] lint

---
 backend/ingest.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/backend/ingest.py b/backend/ingest.py
index 62358cee9..b79a34021 100644
--- a/backend/ingest.py
+++ b/backend/ingest.py
@@ -159,7 +159,10 @@ def ingest_docs():
     logger.info(f"Loaded {len(docs_from_langgraph)} docs from LangGraph")
 
     docs_transformed = text_splitter.split_documents(
-        docs_from_documentation + docs_from_api + docs_from_langsmith + docs_from_langgraph
+        docs_from_documentation
+        + docs_from_api
+        + docs_from_langsmith
+        + docs_from_langgraph
     )
     docs_transformed = [doc for doc in docs_transformed if len(doc.page_content) > 10]