Skip to content

Commit afc1718

Browse files
authored
Added audio_timestamp support. (#913)
1 parent 5009942 commit afc1718

File tree

3 files changed

+37
-0
lines changed

3 files changed

+37
-0
lines changed

libs/vertexai/langchain_google_vertexai/_base.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,10 @@ class _VertexAICommon(_VertexAIBase):
246246
thinking_budget: Optional[int] = Field(
247247
default=None, description="Indicates the thinking budget in tokens."
248248
)
249+
audio_timestamp: Optional[bool] = Field(
250+
default=None,
251+
description="Enable timestamp understanding of audio-only files",
252+
)
249253

250254
@property
251255
def _is_gemini_model(self) -> bool:

libs/vertexai/langchain_google_vertexai/chat_models.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,7 @@
168168
"response_logprobs",
169169
"logprobs",
170170
"labels",
171+
"audio_timestamp",
171172
"thinking_budget",
172173
]
173174
_allowed_params_prediction_service = ["request", "timeout", "metadata", "labels"]
@@ -1355,6 +1356,10 @@ def _prepare_params(
13551356
gapic_response_schema = _convert_schema_dict_to_gapic(response_schema)
13561357
params["response_schema"] = gapic_response_schema
13571358

1359+
audio_timestamp = kwargs.get("audio_timestamp", self.audio_timestamp)
1360+
if audio_timestamp is not None:
1361+
params["audio_timestamp"] = audio_timestamp
1362+
13581363
thinking_budget = kwargs.get("thinking_budget", self.thinking_budget)
13591364
if thinking_budget is not None:
13601365
params["thinking_config"] = {"thinking_budget": thinking_budget}

libs/vertexai/tests/integration_tests/test_chat_models.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import io
55
import json
66
import os
7+
import re
78
from typing import List, Literal, Optional, cast
89

910
import pytest
@@ -342,6 +343,33 @@ def get_climate_info(query: str):
342343
assert isinstance(output["output"], str)
343344

344345

346+
def test_audio_timestamp():
    """Integration test: `audio_timestamp=True` yields timestamped transcription.

    Downloads a sample MP3 from the public cloud-samples-data bucket, sends it
    to Gemini with ``audio_timestamp=True``, and checks that the transcription
    contains line-leading ``MM:SS``-style timestamps.

    Requires: GCS read access and Vertex AI credentials (integration-only).
    """
    storage_client = storage.Client()
    llm = ChatVertexAI(model_name=_DEFAULT_MODEL_NAME, rate_limiter=rate_limiter)

    file_uri = (
        "gs://cloud-samples-data/generative-ai/audio/audio_summary_clean_energy.mp3"
    )
    # NOTE: "audio/mp3" (not the IANA-standard "audio/mpeg") is what the
    # Gemini API documents for MP3 inputs.
    mime_type = "audio/mp3"
    blob = storage.Blob.from_string(file_uri, client=storage_client)
    media_base64 = base64.b64encode(blob.download_as_bytes()).decode()
    media_message = {
        "type": "media",
        "data": media_base64,
        "mime_type": mime_type,
    }
    # Fixed: the media is audio, not video — the instruction should match.
    instruction = """
    Transcribe the audio.
    """
    text_message = {"type": "text", "text": instruction}

    message = HumanMessage(content=[media_message, text_message])
    output = llm.invoke([message], audio_timestamp=True)

    assert isinstance(output.content, str)
    # MULTILINE so `^` matches at any line start: the model may emit a short
    # preamble before the first "MM:SS" timestamp; the original string-start
    # anchor would fail spuriously in that case.
    assert re.search(r"^\d+:\d+", output.content, re.MULTILINE)
371+
372+
345373
def test_parse_history_gemini_multimodal_FC():
346374
storage_client = storage.Client()
347375
# Can't use the pixel.mp3, since it has too many tokens it will hit quota

0 commit comments

Comments
 (0)