tts api 部署调研 - 浥青城和yeolde

科大讯飞 IFLYTEK TTS

官网给出demo了

语音合成（流式版）WebAPI 文档 | 讯飞开放平台文档中心 (xfyun.cn)

class Ws_Param(object):
    """Hold the credentials and request payload for one iFlytek TTS call.

    The instance exposes the three JSON sections sent over the websocket
    (``CommonArgs``, ``BusinessArgs``, ``Data``) and can build the signed
    websocket URL via :meth:`create_url`.
    """

    def __init__(self, APPID, APIKey, APISecret, Text, _vcn="xiaoyan"):
        """Store the credentials and prepare the request sections.

        :param APPID: Application id, sent as ``app_id`` in the common section.
        :param APIKey: API key, embedded in the authorization string.
        :param APISecret: API secret, used as the HMAC-SHA256 key.
        :param Text: Text to synthesize.
        :param _vcn: Voice name placed in the business section.
        """
        self.APPID = APPID
        self.APIKey = APIKey
        self.APISecret = APISecret
        self.Text = Text

        # Common section.
        self.CommonArgs = {"app_id": self.APPID}

        # Business section; further options are described on the vendor site.
        self.BusinessArgs = {
            "aue": "raw",
            "auf": "audio/L16;rate=16000",
            "tte": "utf8",
            "vcn": _vcn,
        }

        # Data section: the text is UTF-8 encoded and then base64-encoded.
        # Per the original note, minority languages need the text encoded as
        # UTF-16LE (what the API calls "unicode") instead of UTF-8.
        encoded_text = base64.b64encode(self.Text.encode('utf-8'))
        self.Data = {"status": 2, "text": str(encoded_text, "UTF8")}

    def create_url(self):
        """Return the websocket URL carrying the signed authorization query."""
        endpoint = 'wss://tts-api.xfyun.cn/v2/tts'
        host = "ws-api.xfyun.cn"

        # RFC 1123 formatted timestamp of the current time.
        date = format_date_time(mktime(datetime.now().timetuple()))

        # The string to sign: host line, date line and request line.
        signature_origin = "\n".join([
            "host: " + host,
            "date: " + date,
            "GET " + "/v2/tts " + "HTTP/1.1",
        ])

        # HMAC-SHA256 over the string above, keyed with the API secret,
        # then base64-encoded.
        digest = hmac.new(
            self.APISecret.encode('utf-8'),
            signature_origin.encode('utf-8'),
            digestmod=hashlib.sha256,
        ).digest()
        signature_sha = base64.b64encode(digest).decode(encoding='utf-8')

        authorization_origin = "api_key=\"%s\", algorithm=\"%s\", headers=\"%s\", signature=\"%s\"" % (
            self.APIKey, "hmac-sha256", "host date request-line", signature_sha)
        authorization = base64.b64encode(
            authorization_origin.encode('utf-8')).decode(encoding='utf-8')

        # Authentication parameters appended to the endpoint as a query string.
        query = {
            "authorization": authorization,
            "date": date,
            "host": host,
        }
        return endpoint + '?' + urlencode(query)

def on_message(ws, message, output_filename='output.pcm'):
    """Handle one websocket frame from the TTS service.

    The frame is parsed as JSON. On success (``code == 0``) the base64 audio
    in ``data.audio`` is decoded and appended to ``output_filename``. When the
    frame is the final one (``data.status == 2``) the socket is closed and the
    accumulated PCM file is converted to MP3 next to it.

    :param ws: Websocket object; only ``close()`` is called on it.
    :param message: Raw JSON text of the frame.
    :param output_filename: PCM file the audio chunks are appended to.
    """
    try:
        message = json.loads(message)
        code = message["code"]
        sid = message["sid"]
        print(message)

        # Check the status code before touching "data": an error frame may
        # carry no audio payload, and the server's error text is what the
        # user needs to see.
        if code != 0:
            errMsg = message["message"]
            print("sid:%s call error:%s code is:%s" % (sid, errMsg, code))
            return

        data = message["data"]
        status = data["status"]
        audio = base64.b64decode(data.get("audio") or "")

        # Append this chunk first so that the final frame's audio is already
        # on disk when the conversion below reads the file.
        with open(output_filename, 'ab') as f:
            f.write(audio)

        if status == 2:
            print("ws is closed")
            ws.close()
            pcm_to_mp3(output_filename, output_filename.replace('.pcm', '.mp3'))

    except Exception as e:
        # Best effort: a bad frame must not crash the websocket callback loop.
        print("receive msg,but parse exception:", e)


# 收到websocket错误的处理
def on_error(ws, error):
    """Print a websocket error; ``ws`` is accepted but not used."""
    prefix = "### error:"
    print(prefix, error)


# 收到websocket关闭的处理
def on_close(ws, close_status_code=None, close_msg=None):
    """Print a marker when the websocket closes.

    The two extra parameters are optional so the callback works both when it
    is invoked with the socket alone and when the websocket library also
    passes the close status code and close message.

    :param ws: Websocket object (unused).
    :param close_status_code: Close status code, if the caller supplies one.
    :param close_msg: Close message, if the caller supplies one.
    """
    print("### closed ###")


# 收到websocket连接建立的处理
def on_open(ws):
    """Send the synthesis request once the websocket is connected.

    The request is built from the module-level ``wsParam`` object and sent
    from a separate thread started with ``thread.start_new_thread``.

    :param ws: Connected websocket object; ``send()`` is called on it.
    """
    def run(*args):
        # One JSON document carrying the common, business and data sections.
        payload = json.dumps({
            "common": wsParam.CommonArgs,
            "business": wsParam.BusinessArgs,
            "data": wsParam.Data,
        })
        print("------>开始发送文本数据")
        ws.send(payload)
        # NOTE(review): this removes the fixed name 'demo.pcm', not the
        # output file that on_message is given - confirm this is intended.
        if os.path.exists('demo.pcm'):
            os.remove('demo.pcm')

    thread.start_new_thread(run, ())

但是输出是`.pcm`的音频格式，而我需要`.mp3`的音频格式，所以需要多写一个音频格式转换的函数：

def pcm_to_mp3(pcm_file, mp3_file, channels=1, sample_rate=16000, bit_depth=16):
    """
    Convert a raw PCM file to MP3 format.

    :param pcm_file: Path to the input PCM file.
    :param mp3_file: Path where the output MP3 file will be saved.
    :param channels: Number of audio channels. Default is 1 (mono).
    :param sample_rate: Sample rate in Hz. Default is 16000.
    :param bit_depth: Bit depth. Default is 16.
    """
    # Read the raw samples with a context manager so the input handle is
    # closed even if reading fails.
    with open(pcm_file, 'rb') as pcm:
        raw_samples = pcm.read()

    audio = AudioSegment(
        data=raw_samples,
        sample_width=bit_depth // 8,  # bits per sample -> bytes per sample
        frame_rate=sample_rate,
        channels=channels
    )

    # export() hands back the output file object; close it so the MP3 is
    # flushed and the handle is not leaked.
    exported = audio.export(mp3_file, format="mp3")
    if hasattr(exported, "close"):
        exported.close()

然后就可以免费试用科大讯飞的API进行语音合成了，支持挺多音色，但是免费版本对英文应该是没有支持，就我的试用结果来看，英文是直接念英文字母。

if __name__ == "__main__":
    import os  # local import: only this script section needs it here

    # Fill in the values below to run a test:
    #   APPID, APISecret, APIKey - credentials of the application
    #   Text                     - text to synthesize
    #   _vcn                     - voice name (https://console.xfyun.cn/services/tts)
    #       iFlytek Xiaoyan    Mandarin  xiaoyan
    #       iFlytek Xujiu      Mandarin  aisjiuxu
    #       iFlytek Xiaoping   Mandarin  aisxping
    #       iFlytek Xiaojing   Mandarin  aisjinger
    #       iFlytek Xuxiaobao  Mandarin  aisbabyxu
    wsParam = Ws_Param(APPID='', APISecret='',
                       APIKey='',
                       Text="我们的AI视频真好看真精彩", _vcn='aisxping')

    # Output file name.
    output_filename = 'test_aisxping.pcm'

    # on_message appends every chunk to the output file, so a file left over
    # from a previous run must be removed first; otherwise the new audio is
    # concatenated onto the old one.
    if os.path.exists(output_filename):
        os.remove(output_filename)

    websocket.enableTrace(False)
    wsUrl = wsParam.create_url()
    ws = websocket.WebSocketApp(wsUrl,
                                on_message=lambda ws, msg: on_message(ws, msg, output_filename),
                                on_error=on_error,
                                on_close=on_close)
    ws.on_open = on_open
    # NOTE(review): CERT_NONE turns off TLS certificate verification; kept
    # as-is from the original, but consider enabling verification.
    ws.run_forever(sslopt={"cert_reqs": ssl.CERT_NONE})

OpenAI TTS

[OpenAI TTS 官网](https://platform.openai.com/docs/guides/text-to-speech)

# voice_list = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
def generate_speech(input_text, voice="onyx", folder="src/audio", file_name="default.mp3"):
    """Synthesize ``input_text`` with the OpenAI ``tts-1`` model and save it.

    The audio is streamed into ``folder/file_name``; the folder is created
    when it does not exist yet. Any failure is logged and not re-raised.

    :param input_text: Text to convert to speech.
    :param voice: Voice name, e.g. "alloy", "echo", "fable", "onyx", "nova"
        or "shimmer".
    :param folder: Directory the audio file is written to.
    :param file_name: Name of the audio file inside ``folder``.
    """
    import os  # local import keeps this snippet self-contained

    try:
        # Build the output path and make sure its directory exists before
        # the request is made, so the stream can always be written.
        if folder:
            os.makedirs(folder, exist_ok=True)
        speech_file_path = os.path.join(folder, file_name)

        # Initialize OpenAI client
        client = OpenAI(api_key="", base_url="")

        # Generate speech
        with client.audio.speech.with_streaming_response.create(
                model="tts-1",
                voice=voice,
                input=input_text
        ) as response:
            response.stream_to_file(speech_file_path)
            logging.info(f"Speech file saved to {speech_file_path}")

    except Exception as e:
        logging.error(f"Error occurred: {e}")

PlayHT TTS

中英文都支持，能选的声音也很多，也不算贵，是自用最好的一个，还可以自定义音色。

PlayHT

List PlayHT Voices

import requests

# Request headers; fill in the API key ("Authorization") and the user id
# ("X-USER-ID") before running.
headers = {
    "Authorization": "",
    "accept": "text/event-stream",
    "content-type": "application/json",
    "X-USER-ID": ""
}

url = "https://api.play.ht/api/v2/tts"

# Synthesis request: the text, the voice manifest to use, the output format
# and the voice engine.
payload = {
    "text": "Hello from a realistic voice.",
    "voice": "s3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json",
    "output_format": "mp3",
    "voice_engine": "PlayHT2.0"
}

# A timeout keeps the script from hanging forever when the service does not
# answer; without one, requests waits indefinitely.
response = requests.post(url, json=payload, headers=headers, timeout=60)

print(response.text)