Skip to content
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ dependencies = [
"google-cloud-spanner>=3.56.0, <4.0.0", # For Spanner database
"google-cloud-speech>=2.30.0, <3.0.0", # For Audio Transcription
"google-cloud-storage>=2.18.0, <4.0.0", # For GCS Artifact service
"google-genai>=1.64.0, <2.0.0", # Google GenAI SDK
"google-genai>=1.72.0, <2.0.0", # Google GenAI SDK
"graphviz>=0.20.2, <1.0.0", # Graphviz for graph rendering
"httpx>=0.27.0, <1.0.0", # HTTP client library
"jsonschema>=4.23.0, <5.0.0", # Agent Builder config validation
Expand Down
13 changes: 13 additions & 0 deletions src/google/adk/evaluation/eval_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,19 @@ class RubricsBasedCriterion(BaseCriterion):
),
)

evaluate_full_response: bool = Field(
default=False,
description=(
"Whether to evaluate the full agent response including intermediate"
" natural language text (e.g. text emitted before tool calls) in"
" addition to the final response. By default, only the final"
" response text is sent to the judge. When True, text from all"
" intermediate invocation events is concatenated with the final"
" response before evaluation. This is useful for agents that emit"
" text both before and after tool calls within a single invocation."
),
)


class HallucinationsCriterion(BaseCriterion):
"""Criterion to use when evaluating agents response for hallucinations."""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,18 @@ def format_auto_rater_prompt(
"""Returns the autorater prompt."""
self.create_effective_rubrics_list(actual_invocation.rubrics)
user_input = get_text_from_content(actual_invocation.user_content)
final_response = get_text_from_content(actual_invocation.final_response)

# When evaluate_full_response is enabled, include text from intermediate
# invocation events (e.g. text emitted before tool calls) in addition to
# the final response. This is useful for agents that stream text, call
# tools, then stream more text within a single invocation.
criterion = self._eval_metric.criterion
evaluate_full = getattr(criterion, "evaluate_full_response", False)

if evaluate_full:
final_response = self._get_full_response_text(actual_invocation)
else:
final_response = get_text_from_content(actual_invocation.final_response)

rubrics_text = "\n".join([
f"* {r.rubric_content.text_property}"
Expand Down Expand Up @@ -310,3 +321,25 @@ def format_auto_rater_prompt(
)

return auto_rater_prompt

@staticmethod
def _get_full_response_text(invocation: Invocation) -> str:
  """Builds the complete natural-language output text for an invocation.

  Text an agent emits before a tool call (e.g. when presenting a plan) is
  recorded in intermediate_data.invocation_events rather than in
  final_response. This helper gathers text from both places — every
  intermediate event first, then the final response — so the judge sees
  the agent's full output, joined by blank lines.
  """
  segments: list[str] = []
  intermediate = invocation.intermediate_data
  # Guard on type: intermediate_data may be absent or of another shape.
  if intermediate and isinstance(intermediate, InvocationEvents):
    segments.extend(
        event_text
        for event in intermediate.invocation_events
        if (event_text := get_text_from_content(event.content))
    )
  if final_text := get_text_from_content(invocation.final_response):
    segments.append(final_text)
  return "\n\n".join(segments)