Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 11 additions & 6 deletions backend/examples/cli_research.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,7 @@
from langchain_core.messages import HumanMessage
from agent.graph import graph


def main() -> None:
"""Run the research agent from the command line."""
parser = argparse.ArgumentParser(description="Run the LangGraph research agent")

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The docstring for the main function was removed. Per PEP 257, it's good practice to include docstrings for all public functions to improve readability and maintainability. Please consider adding it back.

Suggested change
parser = argparse.ArgumentParser(description="Run the LangGraph research agent")
"""Run the research agent from the command line."""
parser = argparse.ArgumentParser(description="Run the LangGraph research agent")

parser.add_argument("question", help="Research question")
parser.add_argument(
Expand All @@ -21,23 +19,30 @@ def main() -> None:
)
parser.add_argument(
"--reasoning-model",
default="gemini-2.5-pro-preview-05-06",
default="llama-3.3-70b-versatile",
help="Model for the final answer",
)
parser.add_argument(
"--dir",
type=str,
help="Directory to search in",
default=None
)
Comment on lines +13 to +17

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The formatting for this add_argument call is inconsistent with the others in this file and not well-aligned. For improved readability and consistency, please format it as a single line, similar to the other arguments.

    parser.add_argument("--dir", type=str, help="Directory to search in", default=None)

args = parser.parse_args()

config = {"configurable": {"search_dir": args.dir}}

state = {
"messages": [HumanMessage(content=args.question)],
"initial_search_query_count": args.initial_queries,
"max_research_loops": args.max_loops,
"reasoning_model": args.reasoning_model,
}

result = graph.invoke(state)
result = graph.invoke(state, config=config)
messages = result.get("messages", [])
if messages:
print(messages[-1].content)


if __name__ == "__main__":
main()
main()
137 changes: 56 additions & 81 deletions backend/src/agent/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@

from agent.tools_and_schemas import SearchQueryList, Reflection
from dotenv import load_dotenv
from langchain_core.messages import AIMessage
from langchain_core.messages import AIMessage, SystemMessage
from langgraph.types import Send
from langgraph.graph import StateGraph
from langgraph.graph import START, END
from langchain_core.runnables import RunnableConfig
from google.genai import Client
from langchain_groq import ChatGroq

from agent.state import (
OverallState,
Expand All @@ -23,7 +23,7 @@
reflection_instructions,
answer_instructions,
)
from langchain_google_genai import ChatGoogleGenerativeAI
from google.genai import Client
from agent.utils import (
get_citations,
get_research_topic,
Expand All @@ -36,15 +36,24 @@
if os.getenv("GEMINI_API_KEY") is None:
raise ValueError("GEMINI_API_KEY is not set")

if os.getenv("GROQ_API_KEY") is None:
raise ValueError("GROQ_API_KEY is not set")

# Used for Google Search API
genai_client = Client(api_key=os.getenv("GEMINI_API_KEY"))

# Initialize Groq LLM for research tasks
groq_llm = ChatGroq(
model="llama-3.3-70b-versatile",
temperature=0,
api_key=os.getenv("GROQ_API_KEY")
)

# Nodes
def generate_query(state: OverallState, config: RunnableConfig) -> QueryGenerationState:
"""LangGraph node that generates search queries based on the User's question.

Uses Gemini 2.0 Flash to create an optimized search queries for web research based on
Uses Llama 3.3 70B via Groq to create optimized search queries for web research based on
the User's question.

Args:
Expand All @@ -60,14 +69,8 @@ def generate_query(state: OverallState, config: RunnableConfig) -> QueryGenerati
if state.get("initial_search_query_count") is None:
state["initial_search_query_count"] = configurable.number_of_initial_queries

# init Gemini 2.0 Flash
llm = ChatGoogleGenerativeAI(
model=configurable.query_generator_model,
temperature=1.0,
max_retries=2,
api_key=os.getenv("GEMINI_API_KEY"),
)
structured_llm = llm.with_structured_output(SearchQueryList)
# init Groq structured output
structured_llm = groq_llm.with_structured_output(SearchQueryList)

# Format the prompt
current_date = get_current_date()
Expand All @@ -93,46 +96,47 @@ def continue_to_web_research(state: QueryGenerationState):


def web_research(state: WebSearchState, config: RunnableConfig) -> OverallState:
"""LangGraph node that performs web research using the native Google Search API tool.

Executes a web search using the native Google Search API tool in combination with Gemini 2.0 Flash.
"""LangGraph node that performs local directory research instead of Google Search.

Args:
state: Current graph state containing the search query and research loop count
config: Configuration for the runnable, including search API settings
state: Current graph state containing the search query
config: Configuration containing the 'search_dir' parameter

Returns:
Dictionary with state update, including sources_gathered, research_loop_count, and web_research_results
Dictionary with state update, including sources_gathered and web_research_results
"""
# Configure
configurable = Configuration.from_runnable_config(config)
formatted_prompt = web_searcher_instructions.format(
current_date=get_current_date(),
research_topic=state["search_query"],
)

# Uses the google genai client as the langchain client doesn't return grounding metadata
response = genai_client.models.generate_content(
model=configurable.query_generator_model,
contents=formatted_prompt,
config={
"tools": [{"google_search": {}}],
"temperature": 0,
},
)
# resolve the urls to short urls for saving tokens and time
resolved_urls = resolve_urls(
response.candidates[0].grounding_metadata.grounding_chunks, state["id"]
)
# Gets the citations and adds them to the generated text
citations = get_citations(response, resolved_urls)
modified_text = insert_citation_markers(response.text, citations)
sources_gathered = [item for citation in citations for item in citation["segments"]]
# Get the search directory from the config passed via cli_research.py
search_dir = config.get("configurable", {}).get("search_dir")

if not search_dir:
return {"messages": [SystemMessage(content="Error: Directory for local search not specified.")]}

results = []
try:
# Recursively crawl the directory
for root, dirs, files in os.walk(search_dir):
for file in files:
# Target common text-based files
if file.endswith((".txt", ".md", ".py", ".json", ".csv")):
path = os.path.join(root, file)
try:
with open(path, 'r', encoding='utf-8', errors='ignore') as f:
content = f.read()
# Check if the generated search query exists in the file content
query = state.get("search_query")
if query and query.lower() in content.lower():
results.append(f"Source: {path}\nContent snippet: {content[:500]}...\n")
except Exception:
continue

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Catching a broad Exception and silently continuing with continue can hide critical bugs and make debugging very difficult. It's much safer to catch only the specific exceptions you expect from file operations, such as OSError (which, since Python 3.3, also covers the legacy IOError alias). This prevents masking unexpected issues.

Suggested change
except Exception:
continue
except OSError:
continue

except Exception as e:
return {"messages": [SystemMessage(content=f"FileSystem Error: {str(e)}")]}

final_results = "\n".join(results) if results else "No relevant information found in the specified local directory."

return {
"sources_gathered": sources_gathered,
"sources_gathered": [], # Local files don't need complex grounding metadata for this task
"search_query": [state["search_query"]],
"web_research_result": [modified_text],
"web_research_result": [final_results],
}


Expand All @@ -150,10 +154,8 @@ def reflection(state: OverallState, config: RunnableConfig) -> ReflectionState:
Returns:
Dictionary with state update, including search_query key containing the generated follow-up query
"""
configurable = Configuration.from_runnable_config(config)
# Increment the research loop count and get the reasoning model
state["research_loop_count"] = state.get("research_loop_count", 0) + 1
reasoning_model = state.get("reasoning_model", configurable.reflection_model)

# Format the prompt
current_date = get_current_date()
Expand All @@ -162,14 +164,8 @@ def reflection(state: OverallState, config: RunnableConfig) -> ReflectionState:
research_topic=get_research_topic(state["messages"]),
summaries="\n\n---\n\n".join(state["web_research_result"]),
)
# init Reasoning Model
llm = ChatGoogleGenerativeAI(
model=reasoning_model,
temperature=1.0,
max_retries=2,
api_key=os.getenv("GEMINI_API_KEY"),
)
result = llm.with_structured_output(Reflection).invoke(formatted_prompt)
# init Reasoning Model via Groq
result = groq_llm.with_structured_output(Reflection).invoke(formatted_prompt)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The reasoning_model passed in the state is being ignored here. The node now unconditionally uses the global groq_llm instance, which has a hardcoded model. This makes the --reasoning-model CLI argument ineffective for the reflection step. To honor the user's configuration, you should dynamically initialize a ChatGroq instance using the model from the state.

    reasoning_model = state.get("reasoning_model") or "llama-3.3-70b-versatile"
    llm = ChatGroq(
        model=reasoning_model,
        temperature=1.0,
        api_key=os.getenv("GROQ_API_KEY"),
    )
    result = llm.with_structured_output(Reflection).invoke(formatted_prompt)


return {
"is_sufficient": result.is_sufficient,
Expand Down Expand Up @@ -220,19 +216,15 @@ def evaluate_research(
def finalize_answer(state: OverallState, config: RunnableConfig):
"""LangGraph node that finalizes the research summary.

Prepares the final output by deduplicating and formatting sources, then
combining them with the running summary to create a well-structured
research report with proper citations.
Prepares the final output by combining sources gathered with the running summary
to create a well-structured research report.

Args:
state: Current graph state containing the running summary and sources gathered

Returns:
Dictionary with state update, including running_summary key containing the formatted final summary with sources
Dictionary with state update, including running_summary key containing the formatted final summary
"""
configurable = Configuration.from_runnable_config(config)
reasoning_model = state.get("reasoning_model") or configurable.answer_model

# Format the prompt
current_date = get_current_date()
formatted_prompt = answer_instructions.format(
Expand All @@ -241,27 +233,11 @@ def finalize_answer(state: OverallState, config: RunnableConfig):
summaries="\n---\n\n".join(state["web_research_result"]),
)

# init Reasoning Model, default to Gemini 2.5 Flash
llm = ChatGoogleGenerativeAI(
model=reasoning_model,
temperature=0,
max_retries=2,
api_key=os.getenv("GEMINI_API_KEY"),
)
result = llm.invoke(formatted_prompt)

# Replace the short urls with the original urls and add all used urls to the sources_gathered
unique_sources = []
for source in state["sources_gathered"]:
if source["short_url"] in result.content:
result.content = result.content.replace(
source["short_url"], source["value"]
)
unique_sources.append(source)
# init Reasoning Model via Groq
result = groq_llm.invoke(formatted_prompt)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Similar to the reflection node, the reasoning_model from the state is being ignored. This node unconditionally uses the global groq_llm instance with its hardcoded model, rendering the --reasoning-model CLI argument useless for this final step. Please initialize a ChatGroq instance dynamically with the model from the state.

    reasoning_model = state.get("reasoning_model") or "llama-3.3-70b-versatile"
    llm = ChatGroq(
        model=reasoning_model,
        temperature=0,
        api_key=os.getenv("GROQ_API_KEY"),
    )
    result = llm.invoke(formatted_prompt)


return {
"messages": [AIMessage(content=result.content)],
"sources_gathered": unique_sources,
}


Expand All @@ -275,7 +251,6 @@ def finalize_answer(state: OverallState, config: RunnableConfig):
builder.add_node("finalize_answer", finalize_answer)

# Set the entrypoint as `generate_query`
# This means that this node is the first one called
builder.add_edge(START, "generate_query")
# Add conditional edge to continue with search queries in a parallel branch
builder.add_conditional_edges(
Expand All @@ -290,4 +265,4 @@ def finalize_answer(state: OverallState, config: RunnableConfig):
# Finalize the answer
builder.add_edge("finalize_answer", END)

graph = builder.compile(name="pro-search-agent")
graph = builder.compile(name="pro-search-agent")