-
Notifications
You must be signed in to change notification settings - Fork 5k
生成字幕 #1658
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
Jin-W-FS
wants to merge
2
commits into
RVC-Boss:main
Choose a base branch
from
Jin-W-FS:pullreq
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+94
−22
Open
生成字幕 #1658
Changes from all commits
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -36,6 +36,7 @@ | |
"split_bucket": True, # bool. whether to split the batch into multiple buckets. | ||
"speed_factor":1.0, # float. control the speed of the synthesized audio. | ||
"streaming_mode": False, # bool. whether to return a streaming response. | ||
"with_srt_format": "", # str. "" (no subtitles) or "raw"; other formats such as "srt", "lrc", "vtt" are not implemented yet. | ||
"seed": -1, # int. random seed for reproducibility. | ||
"parallel_infer": True, # bool. whether to use parallel inference. | ||
"repetition_penalty": 1.35 # float. repetition penalty for T2S model. | ||
|
@@ -98,7 +99,7 @@ | |
import os | ||
import sys | ||
import traceback | ||
from typing import Generator | ||
from typing import Generator, List, Union | ||
|
||
now_dir = os.getcwd() | ||
sys.path.append(now_dir) | ||
|
@@ -162,6 +163,7 @@ class TTS_Request(BaseModel): | |
seed:int = -1 | ||
media_type:str = "wav" | ||
streaming_mode:bool = False | ||
with_srt_format:str = "" | ||
parallel_infer:bool = True | ||
repetition_penalty:float = 1.35 | ||
|
||
|
@@ -211,7 +213,38 @@ def pack_audio(io_buffer:BytesIO, data:np.ndarray, rate:int, media_type:str): | |
io_buffer.seek(0) | ||
return io_buffer | ||
|
||
|
||
def pack_srt(srt: List, fmt: str):
    """Convert subtitle data into the requested output format.

    Only the "raw" format exists so far, so every value of ``fmt``
    currently yields ``srt`` itself, unchanged.
    """
    # TODO: support formats like "srt", "lrc", "vtt", ...
    packed = srt  # "raw" and all not-yet-implemented formats share this result
    return packed
|
||
def load_base64_audio(audio):
    """Return the content of ``audio`` as an ASCII base64 string.

    ``audio`` may be a bytes-like object (``bytes``, ``bytearray`` or
    ``memoryview``), a file-like object exposing ``read()``, or a path
    accepted by ``open``.
    """
    import base64

    if isinstance(audio, (bytes, bytearray, memoryview)):
        raw = bytes(audio)
    elif hasattr(audio, "read"):  # file-like object
        raw = audio.read()
    else:  # path-like
        # Context manager closes the handle deterministically instead of
        # leaving it to the garbage collector.
        with open(audio, "rb") as src:
            raw = src.read()
    return base64.b64encode(raw).decode("ascii")
|
||
# Maps the SHA-256 hex digest of a decoded upload to the path it was saved at.
_base64_audio_cache = {}


def save_base64_audio(b64str: str):
    """Decode a base64 payload, write it under ``TEMP/upload`` and return the path.

    Identical payloads are written only once: the saved path is remembered
    in ``_base64_audio_cache`` and reused as long as the file still exists.
    The file extension is guessed from the content with ``filetype``; when
    the type is unknown (or ``filetype`` is not installed) no extension is
    added.
    """
    import base64
    import hashlib
    import uuid

    try:
        import filetype
    except ImportError:
        # Extension guessing is best-effort; fall back to "no extension".
        filetype = None

    data = base64.b64decode(b64str)

    # Key on a digest of the decoded bytes so the cache does not keep every
    # full base64 payload alive in memory.
    key = hashlib.sha256(data).hexdigest()
    cached = _base64_audio_cache.get(key)
    # A cached path is only valid while the file is still on disk; the
    # temporary directory may have been cleaned in the meantime.
    if cached is not None and os.path.isfile(cached):
        return cached

    savedir = 'TEMP/upload'
    ft = filetype.guess(data) if filetype is not None else None
    ext = f'.{ft.extension}' if ft else ''
    os.makedirs(savedir, exist_ok=True)
    saveto = f'{savedir}/{uuid.uuid1()}{ext}'
    with open(saveto, 'wb') as outf:
        outf.write(data)
    _base64_audio_cache[key] = saveto
    return saveto
|
||
# from https://huggingface.co/spaces/coqui/voice-chat-with-mistral/blob/main/app.py | ||
def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=32000): | ||
|
@@ -277,7 +310,7 @@ async def tts_handle(req:dict): | |
{ | ||
"text": "", # str.(required) text to be synthesized | ||
"text_lang": "", # str.(required) language of the text to be synthesized | ||
"ref_audio_path": "", # str.(required) reference audio path | ||
"ref_audio_path": "", # str.(required) reference audio path ; allow data of format base64:xxxxxx | ||
"aux_ref_audio_paths": [], # list.(optional) auxiliary reference audio paths for multi-speaker synthesis | ||
"prompt_text": "", # str.(optional) prompt text for the reference audio | ||
"prompt_lang": "", # str.(required) language of the prompt text for the reference audio | ||
|
@@ -293,6 +326,7 @@ async def tts_handle(req:dict): | |
"seed": -1, # int. random seed for reproducibility. | ||
"media_type": "wav", # str. media type of the output audio, support "wav", "raw", "ogg", "aac". | ||
"streaming_mode": False, # bool. whether to return a streaming response. | ||
"with_srt_format": "", # str. "" (no subtitles) or "raw"; other formats such as "srt", "lrc", "vtt" are not implemented yet. | ||
"parallel_infer": True, # bool.(optional) whether to use parallel inference. | ||
"repetition_penalty": 1.35 # float.(optional) repetition penalty for T2S model. | ||
} | ||
|
@@ -303,14 +337,21 @@ async def tts_handle(req:dict): | |
streaming_mode = req.get("streaming_mode", False) | ||
return_fragment = req.get("return_fragment", False) | ||
media_type = req.get("media_type", "wav") | ||
with_srt_format = req.get("with_srt_format", "") | ||
ref_audio_path = req.get("ref_audio_path", "") | ||
if ref_audio_path.startswith("base64:"): | ||
req['ref_audio_path'] = ref_audio_path = save_base64_audio(ref_audio_path[len("base64:"):]) | ||
|
||
check_res = check_params(req) | ||
if check_res is not None: | ||
return check_res | ||
|
||
if streaming_mode or return_fragment: | ||
req["return_fragment"] = True | ||
|
||
|
||
if streaming_mode: with_srt_format = "" # streaming not support srt | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 流式不支持字幕时最好log输出一下,提醒用户。 |
||
req["return_with_srt"] = "orig" if with_srt_format else "" | ||
|
||
try: | ||
tts_generator=tts_pipeline.run(req) | ||
|
||
|
@@ -324,6 +365,16 @@ def streaming_generator(tts_generator:Generator, media_type:str): | |
# _media_type = f"audio/{media_type}" if not (streaming_mode and media_type in ["wav", "raw"]) else f"audio/x-{media_type}" | ||
return StreamingResponse(streaming_generator(tts_generator, media_type, ), media_type=f"audio/{media_type}") | ||
|
||
elif with_srt_format: | ||
output = [] | ||
for sr, audio_data, srt_data in tts_generator: | ||
audio_data = pack_audio(BytesIO(), audio_data, sr, media_type).getvalue() | ||
output.append({ | ||
"audio": load_base64_audio(audio_data), "media_type": f"audio/{media_type}", | ||
"srt": pack_srt(srt_data, with_srt_format), "srt_fmt": with_srt_format, | ||
}) | ||
return { "message":"succeed", "output":output } # Jsonresponse(status_code=200, content=...) | ||
|
||
else: | ||
sr, audio_data = next(tts_generator) | ||
audio_data = pack_audio(BytesIO(), audio_data, sr, media_type).getvalue() | ||
|
@@ -364,6 +415,7 @@ async def tts_get_endpoint( | |
seed:int = -1, | ||
media_type:str = "wav", | ||
streaming_mode:bool = False, | ||
with_srt_format:str = "", | ||
parallel_infer:bool = True, | ||
repetition_penalty:float = 1.35 | ||
): | ||
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
后处理计算音频时间和恢复顺序这边,不需要返回字幕的话不去计算应该好一点,就是用单独的逻辑去控制是否需要计算。