Skip to content

Commit 8db1233

Browse files
committed
supports pdf/txt file uploading & embedding
1 parent f08bbf4 commit 8db1233

13 files changed

+88934
-122695
lines changed

.vscode/launch.json

-11
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,5 @@
88
"program": "frontend/lib/main.dart",
99
"deviceId": "edge"
1010
},
11-
{
12-
"name": "(Windows) Launch",
13-
"type": "cppvsdbg",
14-
"request": "launch",
15-
"program": "enter program name, for example ${workspaceFolder}/a.exe",
16-
"args": [],
17-
"stopAtEntry": false,
18-
"cwd": "${fileDirname}",
19-
"environment": [],
20-
"console": "externalTerminal"
21-
}
2211
]
2312
}

app/utils/chatgpt/chatgpt_commands.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
from uuid import uuid4
77

88
from fastapi import WebSocket
9-
109
from app.errors.api_exceptions import InternalServerError
1110
from app.utils.chatgpt.chatgpt_buffer import BufferedUserContext
1211
from app.utils.chatgpt.chatgpt_cache_manager import ChatGptCacheManager
@@ -534,7 +533,7 @@ async def testchaining(chain_size: int, buffer: BufferedUserContext) -> Tuple[st
534533
return f"/testchaining {chain_size-1}", ResponseType.REPEAT_COMMAND
535534

536535
@staticmethod
537-
@CommandResponse.send_message_and_stop
536+
@CommandResponse.handle_gpt
538537
async def query(query: str, /, buffer: BufferedUserContext) -> None:
539538
"""Query from redis vectorstore\n
540539
/query <query>"""
@@ -566,6 +565,5 @@ async def embed(text_to_embed: str, /) -> str:
566565
/embed <text_to_embed>"""
567566
await VectorStoreManager.create_documents(
568567
text=text_to_embed,
569-
chunk_size=500,
570568
)
571569
return "Embedding successful!"
app/utils/chatgpt/chatgpt_fileloader.py

+37
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import io
2+
from typing import IO, Any
3+
4+
from langchain.document_loaders.unstructured import UnstructuredBaseLoader
5+
from langchain.docstore.document import Document
6+
from unstructured.partition.auto import partition
7+
8+
9+
class UnstructuredFileIOLoader(UnstructuredBaseLoader):
    """Loader that hands an already-open file object to `unstructured`.

    Unlike a path-based loader, this one never opens anything itself: it
    keeps the file object and a filename it was given and forwards both to
    `partition` when elements are requested.
    """

    def __init__(self, file: IO, filename: str, mode: str = "single", **unstructured_kwargs: Any):
        """Store the file object and its name, then initialize the base loader.

        Args:
            file: Open file-like object whose content will be partitioned.
            filename: Name forwarded to `partition` as `file_filename`
                (presumably used to infer the file type -- confirm against
                the installed `unstructured` version).
            mode: Forwarded unchanged to `UnstructuredBaseLoader`.
            **unstructured_kwargs: Extra keyword arguments forwarded to the
                base loader and later passed on to `partition`.
        """
        self.file = file
        self.filename = filename
        super().__init__(mode=mode, **unstructured_kwargs)

    def _get_elements(self) -> list:
        """Partition the stored file object and return the resulting elements."""
        elements = partition(
            file=self.file,
            file_filename=self.filename,
            **self.unstructured_kwargs,
        )
        return elements

    def _get_metadata(self) -> dict:
        """Return the document metadata; this loader attaches none."""
        return dict()
23+
24+
25+
def read_bytes_to_documents(file: bytes, filename: str) -> list[Document]:
    """Load raw file bytes through `UnstructuredFileIOLoader`.

    Args:
        file: Raw content of the uploaded file.
        filename: Name of the file, passed through to the loader.

    Returns:
        Whatever the loader's `load()` produces for the in-memory stream.
    """
    stream = io.BytesIO(file)
    loader = UnstructuredFileIOLoader(file=stream, filename=filename)
    return loader.load()
27+
28+
29+
def read_bytes_to_text(file: bytes, filename: str) -> str:
    """Extract the text of a file given as raw bytes.

    Args:
        file: Raw content of the uploaded file.
        filename: Name of the file, passed through to the loader.

    Returns:
        The `page_content` of every loaded document, joined by blank lines.
    """
    documents = read_bytes_to_documents(file=file, filename=filename)
    return "\n\n".join(document.page_content for document in documents)
31+
32+
33+
if __name__ == "__main__":
    # Manual smoke test: read a local "test.pdf" and print the extracted text.
    with open(r"test.pdf", "rb") as pdf_handle:
        raw_bytes = pdf_handle.read()
    print(read_bytes_to_text(raw_bytes, "test.pdf"))

app/utils/chatgpt/chatgpt_stream_manager.py

+35-5
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,20 @@
11
from fastapi import WebSocket, WebSocketDisconnect
2-
2+
from fastapi.concurrency import run_in_threadpool
3+
from pydantic import ValidationError
34
from app.errors.gpt_exceptions import GptOtherException, GptTextGenerationException, GptTooMuchTokenException
5+
from app.utils.chatgpt.chatgpt_buffer import BufferedUserContext
46
from app.utils.chatgpt.chatgpt_cache_manager import ChatGptCacheManager
57
from app.utils.chatgpt.chatgpt_commands import (
8+
ChatGptCommands,
9+
command_handler,
610
create_new_chat_room,
711
get_contexts_sorted_from_recent_to_past,
8-
command_handler,
912
)
13+
from app.utils.chatgpt.chatgpt_fileloader import read_bytes_to_text
1014
from app.utils.chatgpt.chatgpt_message_manager import MessageManager
15+
from app.utils.chatgpt.chatgpt_vectorstore_manager import VectorStoreManager
1116
from app.utils.chatgpt.chatgpt_websocket_manager import HandleMessage, SendToWebsocket
1217
from app.utils.logger import api_logger
13-
from app.utils.chatgpt.chatgpt_buffer import BufferedUserContext
1418
from app.viewmodels.base_models import MessageFromWebsocket, MessageToWebsocket
1519
from app.viewmodels.gpt_models import GptRoles
1620

@@ -33,8 +37,20 @@ async def begin_chat(
3337

3438
while True: # loop until connection is closed
3539
try:
36-
# receive message from websocket
37-
received: MessageFromWebsocket = MessageFromWebsocket.parse_raw(await websocket.receive_text())
40+
rcvd: dict = await websocket.receive_json()
41+
assert isinstance(rcvd, dict)
42+
if "filename" in rcvd:
43+
text: str = await run_in_threadpool(
44+
read_bytes_to_text, await websocket.receive_bytes(), rcvd["filename"]
45+
)
46+
docs: list[str] = await VectorStoreManager.create_documents(text)
47+
await SendToWebsocket.message(
48+
websocket=websocket,
49+
msg=f"Successfully embedded documents. You uploaded file begins with...\n\n```{docs[0][:50]}```...",
50+
chat_room_id=buffer.current_chat_room_id,
51+
)
52+
continue
53+
received: MessageFromWebsocket = MessageFromWebsocket(**rcvd)
3854

3955
if received.chat_room_id != buffer.current_chat_room_id: # change chat room
4056
index: int | None = buffer.find_index_of_chatroom(received.chat_room_id)
@@ -76,6 +92,20 @@ async def begin_chat(
7692

7793
except WebSocketDisconnect:
7894
raise WebSocketDisconnect(code=1000, reason="client disconnected")
95+
except (AssertionError, ValidationError):
96+
await SendToWebsocket.message(
97+
websocket=websocket,
98+
msg="Invalid message. Message is not in the correct format, maybe frontend - backend version mismatch?",
99+
chat_room_id=buffer.current_chat_room_id,
100+
)
101+
continue
102+
except ValueError:
103+
await SendToWebsocket.message(
104+
websocket=websocket,
105+
msg="Invalid file type.",
106+
chat_room_id=buffer.current_chat_room_id,
107+
)
108+
continue
79109
except GptTextGenerationException:
80110
await MessageManager.rpop_message_history_safely(
81111
user_gpt_context=buffer.current_user_gpt_context, role=GptRoles.USER

app/utils/chatgpt/chatgpt_vectorstore_manager.py

+9-8
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from asyncio import gather
22
from langchain.text_splitter import TokenTextSplitter
3+
from langchain.document_loaders import UnstructuredFileLoader
34
from app.utils.langchain.redis_vectorstore import Document
45
from app.database.connection import cache
56

@@ -8,17 +9,17 @@ class VectorStoreManager:
89
@staticmethod
910
async def create_documents(
1011
text: str,
11-
chunk_size: int,
12+
chunk_size: int = 500,
1213
chunk_overlap: int = 0,
1314
tokenizer_model: str = "gpt-3.5-turbo",
1415
) -> list[str]:
15-
return await cache.vectorstore.aadd_texts(
16-
texts=TokenTextSplitter(
17-
chunk_size=chunk_size,
18-
chunk_overlap=chunk_overlap,
19-
model_name=tokenizer_model,
20-
).split_text(text)
21-
)
16+
texts = TokenTextSplitter(
17+
chunk_size=chunk_size,
18+
chunk_overlap=chunk_overlap,
19+
model_name=tokenizer_model,
20+
).split_text(text)
21+
await cache.vectorstore.aadd_texts(texts=texts)
22+
return texts
2223

2324
@staticmethod
2425
async def asimilarity_search(queries: list[str], k: int = 1) -> list[list[Document]]:

app/utils/langchain/redis_vectorstore.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@
33

44
import json
55
import logging
6-
from uuid import uuid4
76
from enum import Enum
87
from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Mapping, Optional, Tuple, Type, Union
8+
from uuid import uuid4
99

1010
import numpy as np
1111
import pkg_resources
@@ -16,6 +16,10 @@
1616
from langchain.vectorstores.base import VectorStore
1717
from pydantic import BaseModel, root_validator
1818

19+
try:
20+
from starlette.concurrency import run_in_threadpool
21+
except ImportError:
22+
raise ImportError("Please install starlette to use the Redis vector store. " "pip install starlette")
1923
try:
2024
import redis
2125

@@ -312,7 +316,7 @@ async def aadd_texts(
312316
**kwargs: Any,
313317
) -> List[str]:
314318
"""Add texts data to an existing index."""
315-
ids, pipeline = self._add_texts(texts, metadatas, **kwargs)
319+
ids, pipeline = await run_in_threadpool(self._add_texts, texts, metadatas, **kwargs)
316320
await pipeline.execute() # type: ignore
317321
return ids
318322

@@ -448,7 +452,7 @@ async def asimilarity_search_with_score(self, query: str, k: int = 4) -> List[Tu
448452
Returns:
449453
List of Documents most similar to the query and score for each
450454
"""
451-
redis_query, params_dict = self._similarity_search_with_score(query, k=k)
455+
redis_query, params_dict = await run_in_threadpool(self._similarity_search_with_score, query, k)
452456

453457
# perform vector search
454458
results = await self.client.ft(self.index_name).search(redis_query, params_dict) # type: ignore

app/viewmodels/base_models.py

-3
Original file line numberDiff line numberDiff line change
@@ -118,9 +118,6 @@ class MessageFromWebsocket(BaseModel):
118118
translate: bool
119119
chat_room_id: str
120120

121-
class Config:
122-
orm_mode = True
123-
124121

125122
class CreateChatRoom(BaseModel): # stub
126123
chat_room_type: str

app/web/.last_build_id

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
6fe4c07d32d7ee85313bac15705fffd4
1+
40f065f758a32903d20b775d3e3f717e

app/web/assets/NOTICES

+25
Original file line numberDiff line numberDiff line change
@@ -6949,6 +6949,30 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
69496949
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
69506950
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
69516951
--------------------------------------------------------------------------------
6952+
file_picker
6953+
6954+
MIT License
6955+
6956+
Copyright (c) 2018 Miguel Ruivo
6957+
6958+
Permission is hereby granted, free of charge, to any person obtaining a copy
6959+
of this software and associated documentation files (the "Software"), to deal
6960+
in the Software without restriction, including without limitation the rights
6961+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
6962+
copies of the Software, and to permit persons to whom the Software is
6963+
furnished to do so, subject to the following conditions:
6964+
6965+
The above copyright notice and this permission notice shall be included in all
6966+
copies or substantial portions of the Software.
6967+
6968+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
6969+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
6970+
FITNESS FOR A PARTICULAR PURPOSE AND NON INFRINGEMENT. IN NO EVENT SHALL THE
6971+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
6972+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
6973+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
6974+
SOFTWARE.
6975+
--------------------------------------------------------------------------------
69526976
flatbuffers
69536977

69546978
Apache License
@@ -7225,6 +7249,7 @@ SOFTWARE.
72257249
--------------------------------------------------------------------------------
72267250
flutter_lints
72277251
flutter_markdown
7252+
flutter_plugin_android_lifecycle
72287253
path_provider
72297254
path_provider_android
72307255
path_provider_foundation

app/web/flutter_service_worker.js

+4-4
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ const RESOURCES = {
1010
"assets/assets/images/user_profile.png": "62ec3ddb01749f1173ca5e2972504c03",
1111
"assets/FontManifest.json": "dc3d03800ccca4601324923c0b1d6d57",
1212
"assets/fonts/MaterialIcons-Regular.otf": "e7069dfd19b331be16bed984668fe080",
13-
"assets/NOTICES": "5f187725cbd4ba575eb9ce64fefe3a4a",
13+
"assets/NOTICES": "b67701d66ab4415275394db0a602d216",
1414
"assets/packages/cupertino_icons/assets/CupertinoIcons.ttf": "6d342eb68f170c97609e9da345464e5e",
1515
"canvaskit/canvaskit.js": "97937cb4c2c2073c968525a3e08c86a3",
1616
"canvaskit/canvaskit.wasm": "3de12d898ec208a5f31362cc00f09b9e",
@@ -22,9 +22,9 @@ const RESOURCES = {
2222
"icons/Icon-512.png": "c9242a8fc92d16303b04ac108c9ac581",
2323
"icons/Icon-maskable-192.png": "710e5d264b437fe72bdb99ba514e2266",
2424
"icons/Icon-maskable-512.png": "c9242a8fc92d16303b04ac108c9ac581",
25-
"index.html": "97477ffe63d8e831119b544c3173d940",
26-
"/": "97477ffe63d8e831119b544c3173d940",
27-
"main.dart.js": "c5d0085759139c968f0397db84d37860",
25+
"index.html": "00bbbe3b446dd2ffa5ee43d4e0902e3e",
26+
"/": "00bbbe3b446dd2ffa5ee43d4e0902e3e",
27+
"main.dart.js": "0214b9f8354e4111b7edb14011b9364c",
2828
"manifest.json": "7e9cc88c98034abad925a9f0f78c53d4",
2929
"site_logo.png": "0fb92091fe8696aa5cfeafb0d1a01cbc",
3030
"version.json": "9094aacdae789dccd67fa32109ff1a18"

app/web/index.html

+1-1
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222

2323
<script>
2424
// The value below is injected by flutter build, do not touch.
25-
var serviceWorkerVersion = '1091727571';
25+
var serviceWorkerVersion = '1925346675';
2626
</script>
2727
<!-- This script adds the flutter initialization JS code -->
2828
<script src="flutter.js" defer></script>

0 commit comments

Comments
 (0)