Commit a4418dc: "sensevoice"

1 parent d491773
File tree: 6 files changed, +50 / -30 lines

README.md (+31, -21)

@@ -95,43 +95,49 @@ pip install -r requirements.txt
 
 ## Inference
 
-
-### Method 2
+Supports input of audio in any format and of any duration.
 
 ```python
 from funasr import AutoModel
 from funasr.utils.postprocess_utils import rich_transcription_postprocess
 
 model_dir = "iic/SenseVoiceSmall"
-input_file = (
-    "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav"
-)
 
-model = AutoModel(model=model_dir,
-                  vad_model="fsmn-vad",
-                  vad_kwargs={"max_single_segment_time": 30000},
-                  trust_remote_code=True, device="cuda:0")
 
+model = AutoModel(
+    model=model_dir,
+    vad_model="fsmn-vad",
+    vad_kwargs={"max_single_segment_time": 30000},
+    device="cpu",
+)
+
+# en
 res = model.generate(
-    input=input_file,
+    input=f"{model.model_path}/example/en.mp3",
     cache={},
-    language="zh",  # "zh", "en", "yue", "ja", "ko", "nospeech"
-    use_itn=False,
-    batch_size_s=0,
+    language="auto",  # "zh", "en", "yue", "ja", "ko", "nospeech"
+    use_itn=True,
+    batch_size_s=60,
+    merge_vad=True,
+    merge_length_s=15,
 )
-
 text = rich_transcription_postprocess(res[0]["text"])
-
 print(text)
 ```
 
-The funasr version has integrated the VAD (Voice Activity Detection) model and supports audio input of any duration, with `batch_size_s` in seconds.
-If all inputs are short audios, and batch inference is needed to speed up inference efficiency, the VAD model can be removed, and `batch_size` can be set accordingly.
+Parameter descriptions:
+- `model_dir`: the name of the model, or the model's path on the local disk.
+- `max_single_segment_time`: the maximum length of an audio segment that the `vad_model` will cut, in milliseconds (ms).
+- `use_itn`: whether the output includes punctuation and inverse text normalization.
+- `batch_size_s`: dynamic batch size, measured as the total duration of audio in the batch, in seconds (s).
+- `merge_vad`: whether to merge the short audio fragments cut by the VAD model, with the merged length given by `merge_length_s`, in seconds (s).
+
+If all inputs are short audios (<30s) and batch inference is needed to speed up inference, the VAD model can be removed and `batch_size` set accordingly.
 ```python
 model = AutoModel(model=model_dir, trust_remote_code=True, device="cuda:0")
 
 res = model.generate(
-    input=input_file,
+    input=f"{model.model_path}/example/en.mp3",
     cache={},
     language="zh",  # "zh", "en", "yue", "ja", "ko", "nospeech"
     use_itn=False,

@@ -141,23 +147,27 @@ res = model.generate(
 
 For more usage, please refer to [docs](https://github.com/modelscope/FunASR/blob/main/docs/tutorial/README.md)
 
-### Method 1
+### Inference directly
+
+Supports input of audio in any format, with an input duration limit of 30 seconds or less.
 
 ```python
 from model import SenseVoiceSmall
+from funasr.utils.postprocess_utils import rich_transcription_postprocess
 
 model_dir = "iic/SenseVoiceSmall"
 m, kwargs = SenseVoiceSmall.from_pretrained(model=model_dir)
 
 
 res = m.inference(
     data_in="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
-    language="zh",  # "zh", "en", "yue", "ja", "ko", "nospeech"
+    language="auto",  # "zh", "en", "yue", "ja", "ko", "nospeech"
     use_itn=False,
     **kwargs,
 )
 
-print(res)
+text = rich_transcription_postprocess(res[0]["text"])
+print(text)
 ```
 
 ### Export and Test
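Assembled from the added lines above, the updated long-audio example runs end to end as below. Everything here comes straight from this diff; only the inline comments are editorial glosses:

```python
# Post-commit README example, assembled from the "+" lines of the diff above.
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess

model_dir = "iic/SenseVoiceSmall"

model = AutoModel(
    model=model_dir,
    vad_model="fsmn-vad",                           # VAD segments long inputs first
    vad_kwargs={"max_single_segment_time": 30000},  # cap VAD segments at 30 s (value in ms)
    device="cpu",
)

res = model.generate(
    input=f"{model.model_path}/example/en.mp3",  # sample clip bundled with the model
    cache={},
    language="auto",    # or "zh", "en", "yue", "ja", "ko", "nospeech"
    use_itn=True,       # punctuation + inverse text normalization
    batch_size_s=60,    # dynamic batch: total seconds of audio per batch
    merge_vad=True,     # merge short VAD fragments ...
    merge_length_s=15,  # ... up to 15 s per merged piece
)
text = rich_transcription_postprocess(res[0]["text"])
print(text)
```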

README_zh.md (+11, -4)

@@ -128,18 +128,23 @@ res = model.generate(
 text = rich_transcription_postprocess(res[0]["text"])
 print(text)
 ```
+Parameter descriptions:
+- `model_dir`: the model name, or a path to the model on local disk.
+- `max_single_segment_time`: the maximum audio duration that the `vad_model` will cut into a single segment, in milliseconds (ms).
+- `use_itn`: whether the output includes punctuation and inverse text normalization.
+- `batch_size_s`: dynamic batching; the total duration of audio in a batch, in seconds (s).
+- `merge_vad`: whether to merge the short audio fragments cut by the VAD model, up to a merged length of `merge_length_s`, in seconds (s).
 
-The funasr version has already integrated the VAD model and supports audio input of any duration, with `batch_size_s` in seconds.
 If all inputs are short audios (under 30 s) and batch inference is needed to speed things up, the VAD model can be removed and `batch_size` set accordingly:
 
 ```python
 model = AutoModel(model=model_dir, trust_remote_code=True, device="cuda:0")
 
 res = model.generate(
-    input=input_file,
+    input=f"{model.model_path}/example/en.mp3",
     cache={},
     language="auto",  # "zh", "en", "yue", "ja", "ko", "nospeech"
-    use_itn=False,
+    use_itn=True,
     batch_size=64,
 )
 ```

@@ -152,6 +157,7 @@
 
 ```python
 from model import SenseVoiceSmall
+from funasr.utils.postprocess_utils import rich_transcription_postprocess
 
 model_dir = "iic/SenseVoiceSmall"
 m, kwargs = SenseVoiceSmall.from_pretrained(model=model_dir)

@@ -164,7 +170,8 @@ res = m.inference(
     **kwargs,
 )
 
-print(res)
+text = rich_transcription_postprocess(res[0]["text"])
+print(text)
 ```
 
 ## Service Deployment

demo_funasr.py renamed to demo1.py (+1, -1)

@@ -14,7 +14,7 @@
     model=model_dir,
     vad_model="fsmn-vad",
     vad_kwargs={"max_single_segment_time": 30000},
-    device="cpu",
+    device="cuda:0",
 )
 
 # en

demo.py renamed to demo2.py (+5, -2)

@@ -4,16 +4,19 @@
 # MIT License (https://opensource.org/licenses/MIT)
 
 from model import SenseVoiceSmall
+from funasr.utils.postprocess_utils import rich_transcription_postprocess
+
 
 model_dir = "iic/SenseVoiceSmall"
 m, kwargs = SenseVoiceSmall.from_pretrained(model=model_dir)
 
 
 res = m.inference(
-    data_in="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
+    data_in=f"{m.model_path}/example/en.mp3",
     language="auto",  # "zh", "en", "yue", "ja", "ko", "nospeech"
     use_itn=False,
     **kwargs,
 )
 
-print(res)
+text = rich_transcription_postprocess(res[0]["text"])
+print(text)
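Both demos switch from `print(res)` to the postprocess helper because SenseVoiceSmall's raw hypothesis carries rich-transcription tags (language, emotion, event, ITN markers). A minimal sketch of the effect, using a hypothetical raw string; the exact tag set depends on the model output:

```python
from funasr.utils.postprocess_utils import rich_transcription_postprocess

# Hypothetical raw hypothesis: SenseVoice prefixes text with special tokens
# such as <|en|> (language), <|NEUTRAL|> (emotion), <|Speech|> (event), and
# <|withitn|> (inverse-text-normalized output).
raw = "<|en|><|NEUTRAL|><|Speech|><|withitn|>Hello world."

# The helper strips or renders these tags and returns display-ready text.
print(rich_transcription_postprocess(raw))
```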

requirements.txt (+1, -1)

@@ -3,5 +3,5 @@ torchaudio
 modelscope
 huggingface
 huggingface_hub
-funasr>=1.1.1
+funasr>=1.1.2
 numpy<=1.26.4

webui.py (+1, -1)

@@ -168,7 +168,7 @@ def model_inference(input_wav, language, fs=16000):
         cache={},
         language=language,
         use_itn=True,
-        batch_size_s=0, merge_vad=merge_vad)
+        batch_size_s=60, merge_vad=merge_vad)
 
     print(text)
     text = text[0]["text"]
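The webui change mirrors the README: `batch_size_s` batches by total audio duration rather than by clip count. A hedged sketch of what that enables for multi-file input; the file names are hypothetical, and the `AutoModel` setup is the same as in the README example above:

```python
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess

# Same AutoModel setup as the README example in this commit.
model = AutoModel(
    model="iic/SenseVoiceSmall",
    vad_model="fsmn-vad",
    vad_kwargs={"max_single_segment_time": 30000},
    device="cpu",
)

# funasr's generate accepts a list of inputs; with batch_size_s=60 it groups
# segments by their combined audio duration (~60 s) rather than a fixed count.
files = ["meeting_part1.wav", "meeting_part2.wav"]  # hypothetical inputs
res = model.generate(
    input=files,
    cache={},
    language="auto",
    use_itn=True,
    batch_size_s=60,
    merge_vad=True,
)
for r in res:
    print(rich_transcription_postprocess(r["text"]))
```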
