Whisper, OpenAI's open-source speech-recognition model, is currently about the strongest open option out there: its Chinese support (including Traditional Chinese) clearly beats comparable open-source models from Microsoft and Facebook, and iFLYTEK's commercial service is expensive. I picked the large-v2 checkpoint for this test.

Without further ado, here is the code that converts audio to Chinese text:

import csv
import os

import torch
import datasets
from datasets import Audio, load_dataset
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from zhconv import convert

# Directory for saving transcription results
result_path = "result"
if not os.path.exists(result_path):
    os.makedirs(result_path)

"""
# One-time step: download the model and processor, then save them locally
# so the pipeline below can load them from ./model/whisper-large-v2
processor = AutoProcessor.from_pretrained("openai/whisper-large-v2")
model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-large-v2")
processor.save_pretrained("./model/whisper-large-v2")
model.save_pretrained("./model/whisper-large-v2")
"""

# Build a Dataset directly from local audio files (remote URLs also work)
audio_dataset = datasets.Dataset.from_dict(
    {"audio": [
        # "https://***/v_f1010.mp3",
        "/Users/scripts/video/train/347_1676944773.mp3",
        "/Users/scripts/video/valication/1272-128104-0000.flac",
    ]}
).cast_column("audio", Audio())
print(audio_dataset)
sample = audio_dataset[0]["audio"]  # take the first row

"""
# Alternative: load the audio files with the "audiofolder" loader
audio_dataset = load_dataset("audiofolder", data_files=[
    "https://****/v_f1010.mp3",
    "https://****/v_f1011.mp3",
])
print(audio_dataset)
sample = audio_dataset["train"][0]["audio"]
print(sample)
"""

# Run the ASR pipeline on GPU if available; long audio is chunked into 30-second pieces
device = "cuda:0" if torch.cuda.is_available() else "cpu"
pipe = pipeline(
    "automatic-speech-recognition",
    model="./model/whisper-large-v2",
    chunk_length_s=30,
    device=device,
)

prediction = pipe(sample.copy())["text"]
print("chunk:", prediction)

# prediction = pipe(sample, return_timestamps=True)["chunks"]
# print("timestamp:", prediction)

# Whisper sometimes outputs Traditional Chinese; convert it to Simplified
print(convert(prediction, 'zh-cn'))

The output of running the script is as follows:
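Before moving on, here is a small sketch of two optional tweaks to the pipeline call above, assuming a reasonably recent transformers release: generate_kwargs can force the transcription language and task instead of relying on Whisper's auto-detection, and return_timestamps=True (already hinted at in the commented-out lines) returns per-segment timings. The exact language value accepted ("chinese", "zh", or "<|zh|>") varies between versions, so treat the strings below as an assumption to adjust for your install.

# Sketch: force Chinese transcription and request segment-level timestamps.
# generate_kwargs is forwarded to model.generate(); change the language value
# if your transformers version expects a different format.
result = pipe(
    sample.copy(),
    return_timestamps=True,
    generate_kwargs={"task": "transcribe", "language": "chinese"},
)
print(convert(result["text"], 'zh-cn'))
for chunk in result["chunks"]:
    print(chunk["timestamp"], convert(chunk["text"], 'zh-cn'))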

Although the accuracy on classical poetry isn't 100%, a dedicated fine-tune should definitely improve it.
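For anyone who wants to attempt that fine-tune, below is a minimal sketch that follows the standard Hugging Face Seq2SeqTrainer recipe for Whisper. The ./finetune_data audiofolder, its transcription column, and every hyperparameter are placeholders assumed purely for illustration, not anything from this post.

# Minimal Whisper fine-tuning sketch (standard transformers Seq2SeqTrainer recipe).
# ./finetune_data and its "transcription" metadata column are hypothetical;
# hyperparameters are illustrative only.
from dataclasses import dataclass
from typing import Any, Dict, List, Union

import torch
from datasets import Audio, load_dataset
from transformers import (Seq2SeqTrainer, Seq2SeqTrainingArguments,
                          WhisperForConditionalGeneration, WhisperProcessor)

processor = WhisperProcessor.from_pretrained(
    "openai/whisper-large-v2", language="Chinese", task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
model.config.forced_decoder_ids = None  # no hard-coded prompt tokens during training

# Expect an audiofolder with metadata.csv: file_name,transcription
ds = load_dataset("audiofolder", data_dir="./finetune_data")["train"]
ds = ds.cast_column("audio", Audio(sampling_rate=16000))

def prepare(batch):
    # Turn raw audio into log-mel input features and the transcript into label ids
    audio = batch["audio"]
    batch["input_features"] = processor.feature_extractor(
        audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
    batch["labels"] = processor.tokenizer(batch["transcription"]).input_ids
    return batch

ds = ds.map(prepare, remove_columns=ds.column_names)

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Pad audio features and label ids separately, then mask label padding with -100
        input_features = [{"input_features": f["input_features"]} for f in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")
        label_features = [{"input_ids": f["labels"]} for f in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all():
            labels = labels[:, 1:]  # the start token is re-added by the model
        batch["labels"] = labels
        return batch

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-large-v2-finetuned",
    per_device_train_batch_size=1,   # large-v2 is heavy; a smaller checkpoint trains much faster
    gradient_accumulation_steps=8,
    learning_rate=1e-5,
    max_steps=1000,
    fp16=torch.cuda.is_available(),
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=ds,
    data_collator=DataCollatorSpeechSeq2SeqWithPadding(processor),
    tokenizer=processor.feature_extractor,
)
trainer.train()
trainer.save_model("./model/whisper-large-v2-finetuned")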

I'll continue with video-to-text in a follow-up post.