Skip to content

Commit afc1718

Browse files
authored
Added audio_timestamp support. (#913)
1 parent 5009942 commit afc1718

File tree

3 files changed

+37
-0
lines changed

3 files changed

+37
-0
lines changed

libs/vertexai/langchain_google_vertexai/_base.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,10 @@ class _VertexAICommon(_VertexAIBase):
246246
thinking_budget: Optional[int] = Field(
247247
default=None, description="Indicates the thinking budget in tokens."
248248
)
249+
audio_timestamp: Optional[bool] = Field(
250+
default=None,
251+
description="Enable timestamp understanding of audio-only files",
252+
)
249253

250254
@property
251255
def _is_gemini_model(self) -> bool:

libs/vertexai/langchain_google_vertexai/chat_models.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,7 @@
168168
"response_logprobs",
169169
"logprobs",
170170
"labels",
171+
"audio_timestamp",
171172
"thinking_budget",
172173
]
173174
_allowed_params_prediction_service = ["request", "timeout", "metadata", "labels"]
@@ -1355,6 +1356,10 @@ def _prepare_params(
13551356
gapic_response_schema = _convert_schema_dict_to_gapic(response_schema)
13561357
params["response_schema"] = gapic_response_schema
13571358

1359+
audio_timestamp = kwargs.get("audio_timestamp", self.audio_timestamp)
1360+
if audio_timestamp is not None:
1361+
params["audio_timestamp"] = audio_timestamp
1362+
13581363
thinking_budget = kwargs.get("thinking_budget", self.thinking_budget)
13591364
if thinking_budget is not None:
13601365
params["thinking_config"] = {"thinking_budget": thinking_budget}

libs/vertexai/tests/integration_tests/test_chat_models.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import io
55
import json
66
import os
7+
import re
78
from typing import List, Literal, Optional, cast
89

910
import pytest
@@ -342,6 +343,33 @@ def get_climate_info(query: str):
342343
assert isinstance(output["output"], str)
343344

344345

346+
def test_audio_timestamp():
    """Integration test: `audio_timestamp=True` yields timestamped transcription.

    Downloads a sample MP3 from the public cloud-samples-data bucket, sends it
    to Gemini with ``audio_timestamp=True``, and checks that the transcription
    contains line-leading ``MM:SS``-style timestamps.

    Requires: GCS read access and Vertex AI credentials (integration-only).
    """
    storage_client = storage.Client()
    llm = ChatVertexAI(model_name=_DEFAULT_MODEL_NAME, rate_limiter=rate_limiter)

    file_uri = (
        "gs://cloud-samples-data/generative-ai/audio/audio_summary_clean_energy.mp3"
    )
    # NOTE: "audio/mp3" (not the IANA-standard "audio/mpeg") is what the
    # Gemini API documents for MP3 inputs.
    mime_type = "audio/mp3"
    blob = storage.Blob.from_string(file_uri, client=storage_client)
    media_base64 = base64.b64encode(blob.download_as_bytes()).decode()
    media_message = {
        "type": "media",
        "data": media_base64,
        "mime_type": mime_type,
    }
    # Fixed: the media is audio, not video — the instruction should match.
    instruction = """
    Transcribe the audio.
    """
    text_message = {"type": "text", "text": instruction}

    message = HumanMessage(content=[media_message, text_message])
    output = llm.invoke([message], audio_timestamp=True)

    assert isinstance(output.content, str)
    # MULTILINE so `^` matches at any line start: the model may emit a short
    # preamble before the first "MM:SS" timestamp; the original string-start
    # anchor would fail spuriously in that case.
    assert re.search(r"^\d+:\d+", output.content, re.MULTILINE)
371+
372+
345373
def test_parse_history_gemini_multimodal_FC():
346374
storage_client = storage.Client()
347375
# Can't use the pixel.mp3, since it has too many tokens it will hit quota

0 commit comments

Comments
 (0)