From cb153e8c7d35166ffeaa5d23ce4686c82435fc0b Mon Sep 17 00:00:00 2001 From: Minglei Wang Date: Fri, 12 May 2023 22:14:54 +1000 Subject: [PATCH 1/2] Update links on README.md, ingest.sh and ingest.py Replace ChatVectorDBChain with ConversationalRetrievalChain Add tiktoken to requirements.txt --- .gitignore | 4 +++- README.md | 6 +++--- ingest.py | 2 +- ingest.sh | 2 +- main.py | 4 ++-- query_data.py | 16 ++++++++-------- requirements.txt | 1 + 7 files changed, 19 insertions(+), 16 deletions(-) diff --git a/.gitignore b/.gitignore index 74ef9d7a8..06bf42bf9 100644 --- a/.gitignore +++ b/.gitignore @@ -136,4 +136,6 @@ dmypy.json .DS_Store vectorstore.pkl -langchain.readthedocs.io/ +python.langchain.com/ + +.venv diff --git a/README.md b/README.md index fd2de7e20..8ff41f9c6 100644 --- a/README.md +++ b/README.md @@ -30,11 +30,11 @@ There are two components: ingestion and question-answering. Ingestion has the following steps: 1. Pull html from documentation site -2. Load html with LangChain's [ReadTheDocs Loader](https://langchain.readthedocs.io/en/latest/modules/document_loaders/examples/readthedocs_documentation.html) -3. Split documents with LangChain's [TextSplitter](https://langchain.readthedocs.io/en/latest/reference/modules/text_splitter.html) +2. Load html with LangChain's [ReadTheDocs Loader](https://python.langchain.com/en/latest/modules/document_loaders/examples/readthedocs_documentation.html) +3. Split documents with LangChain's [TextSplitter](https://python.langchain.com/en/latest/reference/modules/text_splitter.html) 4. Create a vectorstore of embeddings, using LangChain's [vectorstore wrapper](https://python.langchain.com/en/latest/modules/indexes/vectorstores.html) (with OpenAI's embeddings and FAISS vectorstore). 
-Question-Answering has the following steps, all handled by [ChatVectorDBChain](https://langchain.readthedocs.io/en/latest/modules/indexes/chain_examples/chat_vector_db.html): +Question-Answering has the following steps, all handled by [ChatVectorDBChain](https://python.langchain.com/en/latest/modules/indexes/chain_examples/chat_vector_db.html): 1. Given the chat history and new user input, determine what a standalone question would be (using GPT-3). 2. Given that standalone question, look up relevant documents from the vectorstore. diff --git a/ingest.py b/ingest.py index 148a8a5f4..d4075a4c6 100644 --- a/ingest.py +++ b/ingest.py @@ -9,7 +9,7 @@ def ingest_docs(): """Get documents from web pages.""" - loader = ReadTheDocsLoader("langchain.readthedocs.io/en/latest/") + loader = ReadTheDocsLoader("python.langchain.com/en/latest/") raw_documents = loader.load() text_splitter = RecursiveCharacterTextSplitter( chunk_size=1000, diff --git a/ingest.sh b/ingest.sh index 73b75a899..94ce31cfb 100755 --- a/ingest.sh +++ b/ingest.sh @@ -2,5 +2,5 @@ # This involves scraping the data from the web and then cleaning up and putting in Weaviate. 
# Error if any command fails set -e -wget -r -A.html https://langchain.readthedocs.io/en/latest/ +wget -r -A.html https://python.langchain.com/en/latest/ python3 ingest.py diff --git a/main.py b/main.py index b829756ff..bda8993be 100644 --- a/main.py +++ b/main.py @@ -38,10 +38,10 @@ async def websocket_endpoint(websocket: WebSocket): question_handler = QuestionGenCallbackHandler(websocket) stream_handler = StreamingLLMCallbackHandler(websocket) chat_history = [] - qa_chain = get_chain(vectorstore, question_handler, stream_handler) + qa_chain = await get_chain(vectorstore, question_handler, stream_handler) # Use the below line instead of the above line to enable tracing # Ensure `langchain-server` is running - # qa_chain = get_chain(vectorstore, question_handler, stream_handler, tracing=True) + # qa_chain = await get_chain(vectorstore, question_handler, stream_handler, tracing=True) while True: try: diff --git a/query_data.py b/query_data.py index c0028317f..2f043e4a7 100644 --- a/query_data.py +++ b/query_data.py @@ -1,7 +1,7 @@ """Create a ChatVectorDBChain for question/answering.""" -from langchain.callbacks.base import AsyncCallbackManager +from langchain.callbacks.manager import AsyncCallbackManager from langchain.callbacks.tracers import LangChainTracer -from langchain.chains import ChatVectorDBChain +from langchain.chains import ConversationalRetrievalChain from langchain.chains.chat_vector_db.prompts import (CONDENSE_QUESTION_PROMPT, QA_PROMPT) from langchain.chains.llm import LLMChain @@ -10,11 +10,11 @@ from langchain.vectorstores.base import VectorStore -def get_chain( +async def get_chain( vectorstore: VectorStore, question_handler, stream_handler, tracing: bool = False -) -> ChatVectorDBChain: - """Create a ChatVectorDBChain for question/answering.""" - # Construct a ChatVectorDBChain with a streaming llm for combine docs +) -> ConversationalRetrievalChain: + """Create a ConversationalRetrievalChain for question/answering.""" + # Construct a 
ConversationalRetrievalChain with a streaming llm for combine docs # and a separate, non-streaming llm for question generation manager = AsyncCallbackManager([]) question_manager = AsyncCallbackManager([question_handler]) @@ -45,8 +45,8 @@ def get_chain( streaming_llm, chain_type="stuff", prompt=QA_PROMPT, callback_manager=manager ) - qa = ChatVectorDBChain( - vectorstore=vectorstore, + qa = ConversationalRetrievalChain( + retriever=vectorstore.as_retriever(), combine_docs_chain=doc_chain, question_generator=question_generator, callback_manager=manager, diff --git a/requirements.txt b/requirements.txt index 1b7831d96..6f0131c2a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,3 +11,4 @@ faiss-cpu bs4 unstructured libmagic +tiktoken \ No newline at end of file From fa603f652dbdf7f08bcd06c35da78c63b9f10534 Mon Sep 17 00:00:00 2001 From: Minglei Wang Date: Sun, 14 May 2023 20:41:28 +1000 Subject: [PATCH 2/2] Rollback change on async get_chain --- main.py | 2 +- query_data.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/main.py b/main.py index bda8993be..ca8764c22 100644 --- a/main.py +++ b/main.py @@ -38,7 +38,7 @@ async def websocket_endpoint(websocket: WebSocket): question_handler = QuestionGenCallbackHandler(websocket) stream_handler = StreamingLLMCallbackHandler(websocket) chat_history = [] - qa_chain = await get_chain(vectorstore, question_handler, stream_handler) + qa_chain = get_chain(vectorstore, question_handler, stream_handler) # Use the below line instead of the above line to enable tracing # Ensure `langchain-server` is running # qa_chain = await get_chain(vectorstore, question_handler, stream_handler, tracing=True) diff --git a/query_data.py b/query_data.py index 2f043e4a7..59f72aec3 100644 --- a/query_data.py +++ b/query_data.py @@ -10,7 +10,7 @@ from langchain.vectorstores.base import VectorStore -async def get_chain( +def get_chain( vectorstore: VectorStore, question_handler, stream_handler, tracing: bool = False ) -> 
ConversationalRetrievalChain: """Create a ConversationalRetrievalChain for question/answering."""