Skip to content

Commit c17d025

Browse files
committed
update testing
1 parent 87029c2 commit c17d025

File tree

3 files changed

+56
-33
lines changed

3 files changed

+56
-33
lines changed

backend/retrieval_graph/configuration.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ class AgentConfiguration(BaseConfiguration):
2222
)
2323

2424
response_model: str = field(
25-
default="anthropic/claude-3-5-sonnet-20240620",
25+
default="openai/gpt-4o-mini",
2626
metadata={
2727
"description": "The language model used for generating responses. Should be in the form: provider/model-name."
2828
},

backend/retrieval_graph/graph.py

Lines changed: 33 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
conducting research, and formulating responses.
77
"""
88

9-
from typing import Any, Literal, TypedDict, cast
9+
from typing import Any, Literal, Type, TypedDict, cast
1010

1111
from langchain_core.messages import BaseMessage
1212
from langchain_core.runnables import RunnableConfig
@@ -33,6 +33,10 @@ async def analyze_and_route_query(
3333
Returns:
3434
dict[str, Router]: A dictionary containing the 'router' key with the classification result (classification type and logic).
3535
"""
36+
# allow skipping the router for testing
37+
if state.router and state.router["logic"]:
38+
return {"router": state.router}
39+
3640
configuration = AgentConfiguration.from_runnable_config(config)
3741
model = load_chat_model(configuration.query_model)
3842
messages = [
@@ -207,22 +211,31 @@ async def respond(
207211

208212

209213
# Define the graph
210-
builder = StateGraph(AgentState, input=InputState, config_schema=AgentConfiguration)
211-
builder.add_node(analyze_and_route_query)
212-
builder.add_node(ask_for_more_info)
213-
builder.add_node(respond_to_general_query)
214-
builder.add_node(conduct_research)
215-
builder.add_node(create_research_plan)
216-
builder.add_node(respond)
217-
218-
builder.add_edge(START, "analyze_and_route_query")
219-
builder.add_conditional_edges("analyze_and_route_query", route_query)
220-
builder.add_edge("create_research_plan", "conduct_research")
221-
builder.add_conditional_edges("conduct_research", check_finished)
222-
builder.add_edge("ask_for_more_info", END)
223-
builder.add_edge("respond_to_general_query", END)
224-
builder.add_edge("respond", END)
225-
226-
# Compile into a graph object that you can invoke and deploy.
227-
graph = builder.compile()
228-
graph.name = "RetrievalGraph"
214+
215+
216+
def make_graph(*, input_schema: Type[Any]):
    """Assemble and compile the retrieval agent graph.

    Args:
        input_schema: State class used as the graph's input schema.
            Production passes ``InputState``; tests pass ``AgentState`` so
            they can pre-seed internal fields (e.g. ``router``).

    Returns:
        The compiled graph, named ``"RetrievalGraph"``.
    """
    sg = StateGraph(AgentState, input=input_schema, config_schema=AgentConfiguration)

    # Register every node function; the string node names used in the edges
    # below are derived from these function names.
    for node_fn in (
        analyze_and_route_query,
        ask_for_more_info,
        respond_to_general_query,
        conduct_research,
        create_research_plan,
        respond,
    ):
        sg.add_node(node_fn)

    # Wire the control flow: route the incoming query, loop research until
    # finished, then terminate at one of the responder nodes.
    sg.add_edge(START, "analyze_and_route_query")
    sg.add_conditional_edges("analyze_and_route_query", route_query)
    sg.add_edge("create_research_plan", "conduct_research")
    sg.add_conditional_edges("conduct_research", check_finished)
    for terminal in ("ask_for_more_info", "respond_to_general_query", "respond"):
        sg.add_edge(terminal, END)

    # Compile into a graph object that you can invoke and deploy.
    compiled = sg.compile()
    compiled.name = "RetrievalGraph"
    return compiled


# Default production graph, built with the public input schema.
graph = make_graph(input_schema=InputState)

backend/tests/evals/test_e2e.py

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import asyncio
12
from typing import Any
23

34
import pandas as pd
@@ -6,10 +7,11 @@
67
from langchain_core.messages import AIMessage
78
from langchain_core.prompts import ChatPromptTemplate
89
from langchain_core.pydantic_v1 import BaseModel, Field
9-
from langsmith.evaluation import EvaluationResults, evaluate
10+
from langsmith.evaluation import EvaluationResults, aevaluate
1011
from langsmith.schemas import Example, Run
1112

12-
from backend.retrieval_graph.graph import graph
13+
from backend.retrieval_graph.graph import make_graph
14+
from backend.retrieval_graph.state import AgentState, Router
1315
from backend.utils import format_docs
1416

1517
DATASET_NAME = "chat-langchain-qa"
@@ -141,10 +143,16 @@ def evaluate_qa_context(run: Run, example: Example) -> dict:
141143

142144
# Run evaluation
143145

146+
# TODO: this is a hack to allow for skipping the router for testing. Add testing for individual components.
147+
graph = make_graph(input_schema=AgentState)
144148

145-
def run_graph(inputs: dict[str, Any]) -> dict[str, Any]:
146-
results = graph.invoke(
147-
{"messages": [("human", inputs["question"])]},
149+
150+
async def run_graph(inputs: dict[str, Any]) -> dict[str, Any]:
    """Run the eval graph on one dataset example and return the final state.

    The ``router`` field is pre-seeded so the run bypasses the query
    classification step (the graph is built with ``AgentState`` as input
    for exactly this purpose).
    """
    initial_state = {
        "messages": [("human", inputs["question"])],
        "router": Router(type="langchain", logic="The question is about LangChain"),
    }
    return await graph.ainvoke(initial_state)
150158

@@ -162,13 +170,15 @@ def convert_single_example_results(evaluation_results: EvaluationResults):
162170
# NOTE: this is more of a regression test
163171
def test_scores_regression():
164172
# test most commonly used model
165-
experiment_results = evaluate(
166-
lambda inputs: run_graph(inputs),
167-
data=DATASET_NAME,
168-
evaluators=[evaluate_retrieval_recall, evaluate_qa, evaluate_qa_context],
169-
experiment_prefix=EXPERIMENT_PREFIX,
170-
metadata={"judge_model_name": JUDGE_MODEL_NAME},
171-
max_concurrency=4,
173+
experiment_results = asyncio.run(
174+
aevaluate(
175+
run_graph,
176+
data=DATASET_NAME,
177+
evaluators=[evaluate_retrieval_recall, evaluate_qa, evaluate_qa_context],
178+
experiment_prefix=EXPERIMENT_PREFIX,
179+
metadata={"judge_model_name": JUDGE_MODEL_NAME},
180+
max_concurrency=4,
181+
)
172182
)
173183
experiment_result_df = pd.DataFrame(
174184
convert_single_example_results(result["evaluation_results"])

0 commit comments

Comments (0)