diff --git a/egs/zipvoice/local/prepare_token_file_emilia.py b/egs/zipvoice/local/prepare_token_file_emilia.py
index 65aa302..f8fd6c5 100755
--- a/egs/zipvoice/local/prepare_token_file_emilia.py
+++ b/egs/zipvoice/local/prepare_token_file_emilia.py
@@ -52,7 +52,7 @@ def get_args():
def get_pinyin_tokens(pinyin: Path) -> List[str]:
phones = set()
- with open(pinyin, "r") as f:
+ with open(pinyin, "r", encoding="utf-8") as f:
for line in f:
x = line.strip()
initial = to_initials(x, strict=False)
diff --git a/egs/zipvoice_dialog/local/prepare_opendialog.py b/egs/zipvoice_dialog/local/prepare_opendialog.py
index 4934dc3..b25dd3f 100644
--- a/egs/zipvoice_dialog/local/prepare_opendialog.py
+++ b/egs/zipvoice_dialog/local/prepare_opendialog.py
@@ -155,7 +155,7 @@ def prepare_subset(
logging.info(f"Reading {jsonl_path}")
recordings_path_set = set()
supervision_list = list()
- with open(jsonl_path, "r") as fr:
+ with open(jsonl_path, "r", encoding="utf-8") as fr:
for line in fr:
try:
items = json.loads(line)
diff --git a/requirements-webui.txt b/requirements-webui.txt
new file mode 100644
index 0000000..da0ab4e
--- /dev/null
+++ b/requirements-webui.txt
@@ -0,0 +1 @@
+gradio
\ No newline at end of file
diff --git a/webui_zh.py b/webui_zh.py
new file mode 100644
index 0000000..e22067d
--- /dev/null
+++ b/webui_zh.py
@@ -0,0 +1,470 @@
+# webui_zh
+# Created by @ByronLeeeee
+
+import gradio as gr
+import subprocess
+import os
+import time
+import torch
+import sys
+import logging
+import pandas as pd
+import shutil
+import zipfile
+
+# --- 全局设置 ---
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+# 确保输出目录存在
+os.makedirs("outputs", exist_ok=True)
+
+os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
+
+# --- 设备信息检查 ---
+def get_device_info():
+ """检查并返回当前 Torch 使用的设备信息。"""
+ if torch.cuda.is_available():
+ gpu_name = torch.cuda.get_device_name(0)
+ pytorch_cuda_version = torch.version.cuda
+ return f"""
+
+ ✅ CUDA 可用! 正在使用 GPU 加速。
+ GPU: {gpu_name} | PyTorch CUDA: {pytorch_cuda_version}
+
+ """
+ else:
+ return """
+
+ ⚠️ CUDA 不可用! 程序将运行在 CPU 上,建议使用ONNX模式进行推理。
+
+ """
+
+# --- 核心命令行执行函数 ---
+def run_command(command, progress_desc="正在合成..."):
+ """执行命令行命令并记录输出。"""
+ logging.info(f"执行命令: {' '.join(command)}")
+ try:
+ process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, encoding='utf-8')
+ stdout, stderr = process.communicate()
+
+ if process.returncode != 0:
+ logging.error(f"命令执行失败! 返回码: {process.returncode}")
+ logging.error(f"标准输出:\n{stdout}")
+ logging.error(f"标准错误:\n{stderr}")
+ raise gr.Error(f"后端脚本执行失败: {stderr[:1000]}")
+ else:
+ logging.info(f"脚本标准输出:\n{stdout}")
+ if stderr:
+ logging.warning(f"脚本标准错误输出:\n{stderr}")
+
+ except FileNotFoundError:
+ logging.error(f"命令未找到: {command[0]}")
+ raise gr.Error(f"无法执行命令。请确保您在正确的 ZipVoice 环境和目录中运行此脚本。")
+ except Exception as e:
+ logging.error(f"执行命令时发生未知错误: {e}")
+ raise gr.Error(f"发生未知错误: {str(e)}")
+
+
+# --- 各个标签页的后端函数 ---
+
+def inference_single_speaker_cli(
+ model_name, prompt_audio_path, prompt_text, target_text,
+ guidance_scale, num_step, speed, progress=gr.Progress()
+):
+ if not all([prompt_audio_path, prompt_text, target_text]):
+ raise gr.Error("请提供所有输入:参考音频、参考音频文本和目标文本。")
+ progress(0.1, desc="准备参数...")
+ output_filename = f"outputs/output_single_{int(time.time())}.wav"
+ python_executable = sys.executable
+ command = [
+ python_executable, "-m", "zipvoice.bin.infer_zipvoice",
+ "--model-name", model_name,
+ "--prompt-wav", prompt_audio_path,
+ "--prompt-text", prompt_text,
+ "--text", target_text,
+ "--res-wav-path", output_filename,
+ "--guidance-scale", str(guidance_scale),
+ "--num-step", str(int(num_step)),
+ "--speed", str(speed)
+ ]
+ progress(0.5, desc="正在合成...")
+ run_command(command)
+ progress(1.0, desc="完成!")
+ return output_filename
+
+def inference_onnx_cli(
+ model_name, use_int8, prompt_audio_path, prompt_text, target_text,
+ guidance_scale, num_step, speed, progress=gr.Progress()
+):
+ if not all([prompt_audio_path, prompt_text, target_text]):
+ raise gr.Error("请提供所有输入:参考音频、参考音频文本和目标文本。")
+ progress(0.1, desc="准备参数...")
+ output_filename = f"outputs/output_onnx_{int(time.time())}.wav"
+ python_executable = sys.executable
+ command = [
+ python_executable, "-m", "zipvoice.bin.infer_zipvoice_onnx",
+ "--model-name", model_name,
+ "--onnx-int8", str(use_int8),
+ "--prompt-wav", prompt_audio_path,
+ "--prompt-text", prompt_text,
+ "--text", target_text,
+ "--res-wav-path", output_filename,
+ "--guidance-scale", str(guidance_scale),
+ "--num-step", str(int(num_step)),
+ "--speed", str(speed)
+ ]
+ progress(0.5, desc="正在合成 (ONNX)...")
+ run_command(command)
+ progress(1.0, desc="完成!")
+ return output_filename
+
+def inference_dialogue_cli(
+ model_name, prompt_type,
+ merged_prompt_audio_path, merged_prompt_text,
+ spk1_prompt_audio_path, spk1_prompt_text,
+ spk2_prompt_audio_path, spk2_prompt_text,
+ dialogue_text,
+ guidance_scale, num_step, speed, progress=gr.Progress()
+):
+ if not dialogue_text:
+ raise gr.Error("请输入要合成的对话文本。")
+ progress(0.1, desc="创建临时任务文件...")
+
+ temp_tsv_filename = f"temp_dialog_list_{int(time.time())}.tsv"
+ output_wav_name = f"dialogue_{int(time.time())}"
+ output_dir = "outputs"
+ output_filename = os.path.join(output_dir, f"{output_wav_name}.wav")
+
+ try:
+ with open(temp_tsv_filename, "w", encoding="utf-8") as f:
+ if prompt_type == "合并的Prompt":
+ if not all([merged_prompt_audio_path, merged_prompt_text]):
+ raise gr.Error("请提供合并的参考音频和文本。")
+ line = f"{output_wav_name}\t{merged_prompt_text}\t{merged_prompt_audio_path}\t{dialogue_text}"
+ f.write(line)
+ else:
+ if not all([spk1_prompt_audio_path, spk1_prompt_text, spk2_prompt_audio_path, spk2_prompt_text]):
+ raise gr.Error("请为两位说话人提供完整的参考音频和文本。")
+ line = f"{output_wav_name}\t{spk1_prompt_text}\t{spk2_prompt_text}\t{spk1_prompt_audio_path}\t{spk2_prompt_audio_path}\t{dialogue_text}"
+ f.write(line)
+
+ python_executable = sys.executable
+ command = [
+ python_executable, "-m", "zipvoice.bin.infer_zipvoice_dialog",
+ "--model-name", model_name,
+ "--test-list", temp_tsv_filename,
+ "--res-dir", output_dir,
+ "--guidance-scale", str(guidance_scale),
+ "--num-step", str(int(num_step)),
+ "--speed", str(speed)
+ ]
+
+ progress(0.5, desc="正在合成对话...")
+ run_command(command)
+
+ finally:
+ if os.path.exists(temp_tsv_filename):
+ os.remove(temp_tsv_filename)
+
+ progress(1.0, desc="完成!")
+ return output_filename
+
+def inference_batch_cli(
+ task_type, model_name, tsv_file, dataframe, progress=gr.Progress()
+):
+ if tsv_file is None and (dataframe is None or dataframe.empty):
+ raise gr.Error("请上传一个 TSV 文件或在编辑器中创建数据。")
+
+ progress(0.1, desc="准备批量任务...")
+
+ temp_tsv_filename = f"temp_batch_list_{int(time.time())}.tsv"
+
+ if tsv_file is not None:
+ # 如果上传了文件,使用它
+ shutil.copy(tsv_file.name, temp_tsv_filename)
+ else:
+ # 否则,使用 DataFrame 的内容
+ dataframe.to_csv(temp_tsv_filename, sep='\t', header=False, index=False)
+
+ batch_id = f"batch_{int(time.time())}"
+ output_dir = os.path.join("outputs", batch_id)
+ os.makedirs(output_dir, exist_ok=True)
+
+ python_executable = sys.executable
+
+ if task_type == "单人语音":
+ script_name = "zipvoice.bin.infer_zipvoice"
+ else: # 对话
+ script_name = "zipvoice.bin.infer_zipvoice_dialog"
+
+ command = [
+ python_executable, "-m", script_name,
+ "--model-name", model_name,
+ "--test-list", temp_tsv_filename,
+ "--res-dir", output_dir
+ ]
+
+ try:
+ total_lines = sum(1 for line in open(temp_tsv_filename, 'r', encoding='utf-8'))
+ progress(0.5, desc=f"正在处理 {total_lines} 条音频...")
+ run_command(command, progress_desc=f"正在处理 {total_lines} 条音频...")
+
+ progress(0.9, desc="正在打包结果...")
+ zip_path = os.path.join("outputs", f"{batch_id}_results.zip")
+ with zipfile.ZipFile(zip_path, 'w') as zipf:
+ for root, _, files in os.walk(output_dir):
+ for file in files:
+ zipf.write(os.path.join(root, file), arcname=file)
+
+ finally:
+ if os.path.exists(temp_tsv_filename):
+ os.remove(temp_tsv_filename)
+ if os.path.exists(output_dir):
+ shutil.rmtree(output_dir) # 删除临时文件夹
+
+ progress(1.0, desc="批量处理完成!")
+ return gr.update(value=zip_path, visible=True)
+
+# --- UI 辅助函数 ---
+def update_single_speaker_defaults(model_name):
+ """根据单人 TTS 模型名称更新高级参数的默认值。"""
+ if model_name == "zipvoice":
+ return gr.update(value=1.0), gr.update(value=16)
+ elif model_name == "zipvoice_distill":
+ return gr.update(value=3.0), gr.update(value=8)
+ # 默认回退
+ return gr.update(), gr.update()
+
+# --- Gradio UI 界面定义 ---
+with gr.Blocks(theme=gr.themes.Soft(), title="ZipVoice WebUI") as app:
+ gr.Markdown("# ⚡ ZipVoice 语音合成 WebUI")
+ gr.Markdown("这是一个基于 [k2-fsa/ZipVoice](https://github.com/k2-fsa/ZipVoice) 项目的 WebUI。")
+ gr.Markdown(value=get_device_info())
+
+ with gr.Tabs():
+ # --- 单人语音合成 (PyTorch) ---
+ with gr.TabItem("1. 单人语音合成 (PyTorch)"):
+ with gr.Row():
+ with gr.Column(scale=2):
+ gr.Markdown("### 输入")
+ model_name_single = gr.Dropdown(["zipvoice_distill", "zipvoice"], value="zipvoice_distill", label="模型")
+ prompt_audio_single = gr.Audio(label="参考音频", type="filepath", sources=["upload", "microphone"])
+ prompt_text_single = gr.Textbox(label="参考音频的文本", placeholder="输入参考音频对应的文本...")
+ target_text_single = gr.Textbox(label="目标文本", placeholder="输入想要合成的文本...", lines=3)
+ with gr.Accordion("高级设置", open=False):
+ guidance_scale_single = gr.Slider(minimum=0.5, maximum=5.0, value=3.0, step=0.1, label="引导系数")
+ num_step_single = gr.Slider(minimum=2, maximum=20, value=8, step=1, label="步数")
+ speed_single = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="语速")
+ submit_btn_single = gr.Button("合成语音", variant="primary")
+ with gr.Column(scale=1):
+ gr.Markdown("### 输出")
+ output_audio_single = gr.Audio(label="生成的音频")
+
+ # --- 单人语音合成 (ONNX) ---
+ with gr.TabItem("2. 单人语音合成 (ONNX CPU)"):
+ with gr.Row():
+ with gr.Column(scale=2):
+ gr.Markdown("### 输入 (ONNX)")
+ model_name_onnx = gr.Dropdown(["zipvoice_distill", "zipvoice"], value="zipvoice_distill", label="模型")
+ use_int8_onnx = gr.Checkbox(label="使用 INT8 量化模型 (更快)", value=False)
+ prompt_audio_onnx = gr.Audio(label="参考音频", type="filepath", sources=["upload", "microphone"])
+ prompt_text_onnx = gr.Textbox(label="参考音频的文本", placeholder="输入参考音频对应的文本...")
+ target_text_onnx = gr.Textbox(label="目标文本", placeholder="输入想要合成的文本...", lines=3)
+ with gr.Accordion("高级设置", open=False):
+ guidance_scale_onnx = gr.Slider(minimum=0.5, maximum=5.0, value=3.0, step=0.1, label="引导系数")
+ num_step_onnx = gr.Slider(minimum=2, maximum=20, value=8, step=1, label="步数")
+ speed_onnx = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="语速")
+ submit_btn_onnx = gr.Button("合成语音 (ONNX)", variant="primary")
+ with gr.Column(scale=1):
+ gr.Markdown("### 输出")
+ output_audio_onnx = gr.Audio(label="生成的音频")
+
+ # --- 对话语音合成 ---
+ with gr.TabItem("3. 对话语音合成"):
+ with gr.Row():
+ with gr.Column(scale=2):
+ gr.Markdown("### 输入")
+ model_name_dialogue = gr.Dropdown(["zipvoice_dialog", "zipvoice_dialog_stereo"], value="zipvoice_dialog", label="模型")
+ prompt_type = gr.Radio(["合并的Prompt", "分离的Prompt"], label="参考音频类型", value="分离的Prompt")
+ with gr.Group(visible=False) as merged_prompt_group:
+ merged_prompt_audio = gr.Audio(label="参考音频 (合并)", type="filepath", sources=["upload"])
+ merged_prompt_text = gr.Textbox(label="参考音频文本 (合并)", placeholder="例如: [S1] 你好。[S2] 你好呀。")
+ with gr.Group(visible=True) as splitted_prompt_group:
+ with gr.Row():
+ with gr.Column():
+ spk1_prompt_audio = gr.Audio(label="说话人1 参考音频", type="filepath", sources=["upload"])
+ spk1_prompt_text = gr.Textbox(label="说话人1 参考文本")
+ with gr.Column():
+ spk2_prompt_audio = gr.Audio(label="说话人2 参考音频", type="filepath", sources=["upload"])
+ spk2_prompt_text = gr.Textbox(label="说话人2 参考文本")
+ dialogue_text = gr.Textbox(label="要合成的对话文本", placeholder="例如: [S1] 我很好,你呢?[S2] 我也很好。", lines=4)
+ with gr.Accordion("高级设置", open=False):
+ guidance_scale_dialogue = gr.Slider(minimum=0.5, maximum=5.0, value=1.5, step=0.1, label="引导系数")
+ num_step_dialogue = gr.Slider(minimum=2, maximum=20, value=16, step=1, label="步数")
+ speed_dialogue = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="语速")
+ submit_btn_dialogue = gr.Button("生成对话", variant="primary")
+ with gr.Column(scale=1):
+ gr.Markdown("### 输出")
+ output_audio_dialogue = gr.Audio(label="生成的对话音频")
+
+ # --- 批量推理 ---
+ with gr.TabItem("4. 批量推理 (TSV)"):
+ gr.Markdown("在此处创建或上传TSV文件进行批量推理。处理完成后,结果将打包为 ZIP 文件供下载。")
+ with gr.Row():
+ with gr.Column(scale=2):
+ batch_task_type = gr.Radio(["单人语音", "对话"], label="任务类型", value="单人语音")
+ batch_model_name = gr.Dropdown(
+ ["zipvoice_distill", "zipvoice", "zipvoice_dialog", "zipvoice_dialog_stereo"],
+ value="zipvoice_distill",
+ label="模型"
+ )
+
+ gr.Markdown("#### 编辑或上传您的 TSV 文件")
+ upload_btn = gr.UploadButton("上传 TSV 文件", file_types=[".tsv"])
+
+ df_single_headers = ["wav_name", "prompt_transcription", "prompt_wav", "text"]
+ df_dialogue_headers = ["wav_name", "spk1_prompt_transcription", "spk2_prompt_transcription", "spk1_prompt_wav", "spk2_prompt_wav", "text"]
+
+ dataframe_editor = gr.DataFrame(
+ headers=df_single_headers,
+ datatype=["str"] * len(df_single_headers),
+ row_count=(2, "dynamic"),
+ col_count=(len(df_single_headers), "fixed"),
+ label="TSV 编辑器",
+ wrap=True
+ )
+
+ submit_btn_batch = gr.Button("开始批量推理", variant="primary")
+
+ with gr.Column(scale=1):
+ gr.Markdown("### 输出")
+ batch_output_file = gr.File(label="下载结果 (ZIP)", visible=False)
+ gr.Markdown(
+ """
+ **TSV 格式说明:**
+
+ **单人语音:**
+ `输出文件名\t参考文本\t参考音频路径\t目标文本`
+
+ **对话 (分离Prompt):**
+ `输出文件名\t说话人1文本\t说话人2文本\t说话人1音频路径\t说话人2音频路径\t对话文本`
+
+ *注意: 音频路径应为本地绝对路径或相对于此脚本的相对路径。*
+ """
+ )
+
+ # --- 使用说明 ---
+ with gr.TabItem("5. 使用说明"):
+ gr.Markdown("""
+ ## 如何使用 ZipVoice WebUI
+
+ ### 1. 单人语音合成
+ - **功能**: 克隆一个人的声音并用其朗读新的文本。
+ - **步骤**:
+ 1. 在 **"1. 单人语音合成"** 标签页中,选择一个模型 (`zipvoice_distill` 速度更快)。
+ 2. 上传一段 **参考音频** (Prompt Audio),时长建议在 3-10 秒。
+ 3. 在 **参考音频的文本** 框中,准确输入该参考音频对应的文字。
+ 4. 在 **目标文本** 框中,输入您希望模型朗读的新内容。
+ 5. 点击 **合成语音**。
+
+ ### 2. 对话语音合成
+ - **功能**: 生成包含两个不同说话人的对话。
+ - **步骤**:
+ 1. 在 **"3. 对话语音合成"** 标签页中,选择模型 (`zipvoice_dialog_stereo` 会生成双声道音频)。
+ 2. 选择 **参考音频类型**:
+ - **分离的Prompt (推荐)**: 分别上传两位说话人的独立音频和对应文本。
+ - **合并的Prompt**: 上传一个包含两人对话的音频,并在文本框中用 `[S1]` 和 `[S2]` 标注。
+ 3. 在 **要合成的对话文本** 框中,输入完整的对话内容,并使用 `[S1]` 和 `[S2]` 来区分不同说话人的轮次。
+ - **示例**: `[S1] 你好吗?[S2] 我很好,谢谢。[S1] 不客气。`
+ 4. 点击 **生成对话**。
+
+ ### 3. ONNX CPU 推理
+ - **功能**: 与单人语音合成相同,但使用 ONNX 模型在 CPU 上运行,通常比 PyTorch 在 CPU 上的推理速度更快。
+ - **步骤**:
+ 1. 在 **"2. 单人语音合成 (ONNX CPU)"** 标签页中操作。
+ 2. 勾选 **使用 INT8 量化模型** 可以获得更快的速度,但可能会牺牲一点点质量。
+ 3. 其余步骤与单人语音合成完全相同。
+
+ ### 4. 批量推理
+ - **功能**: 一次性处理多个合成任务。
+ - **步骤**:
+ 1. 在 **"4. 批量推理 (TSV)"** 标签页中,首先选择 **任务类型** (单人或对话) 和模型。
+ 2. **创建数据**: 在 **TSV 编辑器** 中按照格式说明手动输入多行任务。
+ 3. **或上传数据**: 点击 **上传 TSV 文件** 按钮,选择一个本地的制表符分隔文件。
+ 4. 点击 **开始批量推理**。任务完成后,右侧会提供一个包含所有生成音频的 ZIP 文件供下载。
+
+ ### 纠正中文多音字发音
+ 当遇到中文多音字发音错误时,您可以通过 pinyin 手动指定。
+ - **格式**: `这把剑三十公分`
+ - **说明**: 用尖括号 `< >` 包围正确的拼音,并在末尾加上声调数字 (1-4 为四声,5 为轻声)。
+ """)
+
+ # --- 事件处理逻辑 ---
+ submit_btn_single.click(
+ fn=inference_single_speaker_cli,
+ inputs=[model_name_single, prompt_audio_single, prompt_text_single, target_text_single, guidance_scale_single, num_step_single, speed_single],
+ outputs=output_audio_single
+ )
+ submit_btn_onnx.click(
+ fn=inference_onnx_cli,
+ inputs=[model_name_onnx, use_int8_onnx, prompt_audio_onnx, prompt_text_onnx, target_text_onnx, guidance_scale_onnx, num_step_onnx, speed_onnx],
+ outputs=output_audio_onnx
+ )
+ submit_btn_dialogue.click(
+ fn=inference_dialogue_cli,
+ inputs=[model_name_dialogue, prompt_type, merged_prompt_audio, merged_prompt_text, spk1_prompt_audio, spk1_prompt_text, spk2_prompt_audio, spk2_prompt_text, dialogue_text, guidance_scale_dialogue, num_step_dialogue, speed_dialogue],
+ outputs=output_audio_dialogue
+ )
+
+ model_name_single.change(
+ fn=update_single_speaker_defaults,
+ inputs=model_name_single,
+ outputs=[guidance_scale_single, num_step_single]
+ )
+ model_name_onnx.change(
+ fn=update_single_speaker_defaults,
+ inputs=model_name_onnx,
+ outputs=[guidance_scale_onnx, num_step_onnx]
+ )
+
+ def toggle_prompt_type(choice):
+ return gr.update(visible=choice == "合并的Prompt"), gr.update(visible=choice == "分离的Prompt")
+
+ prompt_type.change(fn=toggle_prompt_type, inputs=prompt_type, outputs=[merged_prompt_group, splitted_prompt_group])
+
+ def update_dataframe(task_type):
+ if task_type == "单人语音":
+ headers = df_single_headers
+ else: # 对话
+ headers = df_dialogue_headers
+ return gr.update(headers=headers, col_count=(len(headers), "fixed"), value=None)
+
+ batch_task_type.change(fn=update_dataframe, inputs=batch_task_type, outputs=dataframe_editor)
+
+ def upload_file_to_df(file, task_type):
+ if file is None:
+ return None
+ try:
+ df = pd.read_csv(file.name, sep='\t', header=None)
+ if task_type == "单人语音":
+ expected_cols = 4
+ else: # 对话
+ expected_cols = 6
+
+ if df.shape[1] != expected_cols:
+ raise gr.Error(f"TSV 文件列数错误!'{task_type}' 任务需要 {expected_cols} 列,但文件有 {df.shape[1]} 列。")
+
+ return gr.update(value=df)
+ except Exception as e:
+ raise gr.Error(f"读取或解析 TSV 文件失败: {e}")
+
+ upload_btn.upload(fn=upload_file_to_df, inputs=[upload_btn, batch_task_type], outputs=dataframe_editor)
+
+ submit_btn_batch.click(
+ fn=inference_batch_cli,
+ inputs=[batch_task_type, batch_model_name, upload_btn, dataframe_editor],
+ outputs=batch_output_file
+ )
+
+if __name__ == "__main__":
+ app.launch()
\ No newline at end of file
diff --git a/zipvoice/bin/generate_averaged_model.py b/zipvoice/bin/generate_averaged_model.py
index 7ff432b..05172f8 100644
--- a/zipvoice/bin/generate_averaged_model.py
+++ b/zipvoice/bin/generate_averaged_model.py
@@ -111,7 +111,7 @@ def main():
params.update(vars(args))
params.exp_dir = Path(params.exp_dir)
- with open(params.exp_dir / "model.json", "r") as f:
+ with open(params.exp_dir / "model.json", "r", encoding="utf-8") as f:
model_config = json.load(f)
# Any tokenizer can be used here.
diff --git a/zipvoice/bin/infer_zipvoice.py b/zipvoice/bin/infer_zipvoice.py
index ffd11b8..16b3688 100644
--- a/zipvoice/bin/infer_zipvoice.py
+++ b/zipvoice/bin/infer_zipvoice.py
@@ -646,7 +646,7 @@ def generate_list(
total_t_vocoder = []
total_wav_seconds = []
- with open(test_list, "r") as fr:
+ with open(test_list, "r", encoding="utf-8") as fr:
lines = fr.readlines()
for i, line in enumerate(lines):
@@ -774,7 +774,7 @@ def main():
tokenizer_config = {"vocab_size": tokenizer.vocab_size, "pad_id": tokenizer.pad_id}
- with open(model_config, "r") as f:
+ with open(model_config, "r", encoding="utf-8") as f:
model_config = json.load(f)
if params.model_name == "zipvoice":
diff --git a/zipvoice/bin/infer_zipvoice_dialog.py b/zipvoice/bin/infer_zipvoice_dialog.py
index 9503408..577ca22 100644
--- a/zipvoice/bin/infer_zipvoice_dialog.py
+++ b/zipvoice/bin/infer_zipvoice_dialog.py
@@ -1066,7 +1066,7 @@ def generate_list(
total_t_vocoder = []
total_wav_seconds = []
- with open(test_list, "r") as fr:
+ with open(test_list, "r", encoding="utf-8") as fr:
lines = fr.readlines()
for i, line in enumerate(lines):
@@ -1202,7 +1202,7 @@ def main():
"spk_b_id": tokenizer.spk_b_id,
}
- with open(model_config, "r") as f:
+ with open(model_config, "r", encoding="utf-8") as f:
model_config = json.load(f)
if params.model_name == "zipvoice_dialog":
diff --git a/zipvoice/bin/infer_zipvoice_onnx.py b/zipvoice/bin/infer_zipvoice_onnx.py
index 6852535..99ca54e 100644
--- a/zipvoice/bin/infer_zipvoice_onnx.py
+++ b/zipvoice/bin/infer_zipvoice_onnx.py
@@ -715,7 +715,7 @@ def generate_list(
total_t_vocoder = []
total_wav_seconds = []
- with open(test_list, "r") as fr:
+ with open(test_list, "r", encoding="utf-8") as fr:
lines = fr.readlines()
for i, line in enumerate(lines):
@@ -855,7 +855,7 @@ def main():
assert params.tokenizer == "simple"
tokenizer = SimpleTokenizer(token_file=token_file)
- with open(model_config, "r") as f:
+ with open(model_config, "r", encoding="utf-8") as f:
model_config = json.load(f)
model = OnnxModel(text_encoder_path, fm_decoder_path, num_thread=args.num_thread)
diff --git a/zipvoice/bin/onnx_export.py b/zipvoice/bin/onnx_export.py
index 64b5aae..8217fd8 100644
--- a/zipvoice/bin/onnx_export.py
+++ b/zipvoice/bin/onnx_export.py
@@ -349,7 +349,7 @@ def main():
tokenizer = SimpleTokenizer(token_file)
tokenizer_config = {"vocab_size": tokenizer.vocab_size, "pad_id": tokenizer.pad_id}
- with open(model_config, "r") as f:
+ with open(model_config, "r", encoding="utf-8") as f:
model_config = json.load(f)
if params.model_name == "zipvoice":
diff --git a/zipvoice/bin/prepare_dataset.py b/zipvoice/bin/prepare_dataset.py
index e3a2b28..fcb52b9 100644
--- a/zipvoice/bin/prepare_dataset.py
+++ b/zipvoice/bin/prepare_dataset.py
@@ -193,7 +193,7 @@ def prepare_dataset(
# Step 1: Read all unique recording paths
recordings_path_set = set()
supervision_list = list()
- with open(tsv_path, "r") as fr:
+ with open(tsv_path, "r", encoding="utf-8") as fr:
for line in fr:
items = line.strip().split("\t")
if len(items) == 3:
diff --git a/zipvoice/bin/train_zipvoice.py b/zipvoice/bin/train_zipvoice.py
index 59d3ace..981ae07 100644
--- a/zipvoice/bin/train_zipvoice.py
+++ b/zipvoice/bin/train_zipvoice.py
@@ -875,7 +875,7 @@ def run(rank, world_size, args):
# Set epoch to a large number to ignore it.
if params.num_iters > 0:
params.num_epochs = 1000000
- with open(params.model_config, "r") as f:
+ with open(params.model_config, "r", encoding="utf-8") as f:
model_config = json.load(f)
params.update(model_config["model"])
params.update(model_config["feature"])
diff --git a/zipvoice/bin/train_zipvoice_dialog.py b/zipvoice/bin/train_zipvoice_dialog.py
index c401220..84225e7 100644
--- a/zipvoice/bin/train_zipvoice_dialog.py
+++ b/zipvoice/bin/train_zipvoice_dialog.py
@@ -723,7 +723,7 @@ def run(rank, world_size, args):
# Set epoch to a large number to ignore it.
if params.num_iters > 0:
params.num_epochs = 1000000
- with open(params.model_config, "r") as f:
+ with open(params.model_config, "r", encoding="utf-8") as f:
model_config = json.load(f)
params.update(model_config["model"])
params.update(model_config["feature"])
diff --git a/zipvoice/bin/train_zipvoice_dialog_stereo.py b/zipvoice/bin/train_zipvoice_dialog_stereo.py
index 12ce915..7424503 100644
--- a/zipvoice/bin/train_zipvoice_dialog_stereo.py
+++ b/zipvoice/bin/train_zipvoice_dialog_stereo.py
@@ -728,7 +728,7 @@ def run(rank, world_size, args):
# Set epoch to a large number to ignore it.
if params.num_iters > 0:
params.num_epochs = 1000000
- with open(params.model_config, "r") as f:
+ with open(params.model_config, "r", encoding="utf-8") as f:
model_config = json.load(f)
params.update(model_config["model"])
params.update(model_config["feature"])
diff --git a/zipvoice/bin/train_zipvoice_distill.py b/zipvoice/bin/train_zipvoice_distill.py
index 0a6fc9f..5ddbf8f 100644
--- a/zipvoice/bin/train_zipvoice_distill.py
+++ b/zipvoice/bin/train_zipvoice_distill.py
@@ -866,7 +866,7 @@ def run(rank, world_size, args):
# Set epoch to a large number to ignore it.
if params.num_iters > 0:
params.num_epochs = 1000000
- with open(params.model_config, "r") as f:
+ with open(params.model_config, "r", encoding="utf-8") as f:
model_config = json.load(f)
params.update(model_config["model"])
params.update(model_config["feature"])