Skip to content

add langgraph docs #351

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jul 16, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 44 additions & 11 deletions backend/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import logging
import os
import re
from typing import Optional

import weaviate
from bs4 import BeautifulSoup, SoupStrainer
Expand All @@ -24,15 +25,23 @@ def get_embeddings_model() -> Embeddings:
return OpenAIEmbeddings(model="text-embedding-3-small", chunk_size=200)


def metadata_extractor(meta: dict, soup: BeautifulSoup) -> dict:
title = soup.find("title")
description = soup.find("meta", attrs={"name": "description"})
html = soup.find("html")
def metadata_extractor(
    meta: dict, soup: BeautifulSoup, title_suffix: Optional[str] = None
) -> dict:
    """Build the metadata dict for one crawled page.

    Pulls the page title, meta description, and document language out of
    the parsed soup, optionally appending ``title_suffix`` to the title,
    and merges the result with the loader-supplied ``meta`` mapping
    (whose keys win on collision, since it is spread last).
    """
    title_tag = soup.find("title")
    desc_tag = soup.find("meta", attrs={"name": "description"})
    html_tag = soup.find("html")

    page_title = title_tag.get_text() if title_tag else ""
    if title_suffix is not None:
        page_title = page_title + title_suffix

    return {
        "source": meta["loc"],
        "title": page_title,
        "description": desc_tag.get("content", "") if desc_tag else "",
        "language": html_tag.get("lang", "") if html_tag else "",
        **meta,
    }

Expand All @@ -52,6 +61,18 @@ def load_langchain_docs():
).load()


def load_langgraph_docs():
    """Load the LangGraph documentation pages via the site's sitemap.

    Only ``article`` and ``title`` elements are parsed (via SoupStrainer),
    and each page's title is tagged with a LangGraph suffix so results are
    distinguishable from other doc sources downstream.
    """
    loader = SitemapLoader(
        "https://langchain-ai.github.io/langgraph/sitemap.xml",
        parsing_function=simple_extractor,
        default_parser="lxml",
        bs_kwargs={"parse_only": SoupStrainer(name=("article", "title"))},
        meta_function=lambda meta, soup: metadata_extractor(
            meta, soup, title_suffix=" | 🦜🕸️LangGraph"
        ),
    )
    return loader.load()


def load_langsmith_docs():
return RecursiveUrlLoader(
url="https://docs.smith.langchain.com/",
Expand All @@ -69,8 +90,15 @@ def load_langsmith_docs():
).load()


def simple_extractor(html: str) -> str:
soup = BeautifulSoup(html, "lxml")
def simple_extractor(html: str | BeautifulSoup) -> str:
    """Extract plain text from HTML, collapsing runs of blank lines.

    Accepts either a raw HTML string (parsed here with lxml) or an
    already-parsed BeautifulSoup object; raises ValueError for anything
    else.
    """
    if isinstance(html, BeautifulSoup):
        soup = html
    elif isinstance(html, str):
        soup = BeautifulSoup(html, "lxml")
    else:
        raise ValueError(
            "Input should be either BeautifulSoup object or an HTML string"
        )
    # Squash 2+ consecutive newlines down to a single blank line.
    return re.sub(r"\n\n+", "\n\n", soup.text).strip()


Expand Down Expand Up @@ -126,10 +154,15 @@ def ingest_docs():
docs_from_api = load_api_docs()
logger.info(f"Loaded {len(docs_from_api)} docs from API")
docs_from_langsmith = load_langsmith_docs()
logger.info(f"Loaded {len(docs_from_langsmith)} docs from Langsmith")
logger.info(f"Loaded {len(docs_from_langsmith)} docs from LangSmith")
docs_from_langgraph = load_langgraph_docs()
logger.info(f"Loaded {len(docs_from_langgraph)} docs from LangGraph")

docs_transformed = text_splitter.split_documents(
docs_from_documentation + docs_from_api + docs_from_langsmith
docs_from_documentation
+ docs_from_api
+ docs_from_langsmith
+ docs_from_langgraph
)
docs_transformed = [doc for doc in docs_transformed if len(doc.page_content) > 10]

Expand Down
Loading