《歌声合成系列教程1》歌声合成数据预处理
发布时间
阅读量:
阅读量
该文本描述了一个用于语音合成和音频处理的数据准备函数,核心方法基于参数化合成歌声,与Merlin训练语音模型的方式相似,涉及时长模型、语言模型和声学模型的构建与应用。函数主要包括标注转换(如从HTK标注到对齐的Label文件)、时长校正(通过计算offset进行调整)、数据分割以及音频特征提取(如基频f0、非周期成分ap等),并支持gain归一化操作。该方法的目标是为后续的语音合成或音乐生成任务提供高质量的标注数据和音频特征。
文章目录
-
思路介绍
-
- 核心思路
-
实现编码
思路介绍
核心思路
基于参数化合成歌声的方法类似于Merlin训练语音模型的方式,在这一过程中涉及时长模型、语言模型及声学模型等关键组件。
样本标注需求包括:
全标签(基于HTK格式的标注样本)、基于单音素的时长标记以及从wav提取的语音特征(包含基频(f0)、频谱包络(sp)和非周期成分(ap))。
* 声码器:pyworld
实现编码
# -*- coding:utf-8 -*-
# /usr/bin/python
'''
-------------------------------------------------
File Name : data_prep
Description : AIM:
Functions: 1.
2.
Functions :
Envs : python ==
pip install
Date : 2020/11/4 20:23
CodeStyle : 规范,简洁,易懂,可阅读,可维护,可移植!
-------------------------------------------------
Change Activity:
2020/11/4 : tag
-------------------------------------------------
__Author__ = "Yan Errol"
__Email__ = "2681506@gmail.com"
__Copyright__ = "Copyright 2020, Yan Errol"
-------------------------------------------------
'''
# coding: utf-8
import os
import argparse
from glob import glob
from os.path import join, basename, splitext, exists, expanduser
from nnmnkwii.io import hts
from scipy.io import wavfile
import librosa
import soundfile as sf
import sys
import numpy as np
from nnsvs.io.hts import get_note_indices
def _is_silence(l):
is_full_context = "@" in l
if is_full_context:
is_silence = ("-sil" in l or "-pau" in l)
else:
is_silence = (l == "sil" or l == "pau")
return is_silence
def remove_sil_and_pau(lab):
    """Return a copy of *lab* with silence ("-sil") and pause ("-pau")
    entries dropped.

    The last field of each entry is the full-context label string; entries
    whose context contains neither marker are kept in order.
    """
    filtered = hts.HTSLabelFile()
    for entry in lab:
        context = entry[-1]
        if "-sil" in context or "-pau" in context:
            continue
        filtered.append(entry, strict=False)
    return filtered
def _make_aligned_full_labels(mono_dir, full_dir, full_align_dir):
    """Copy full-context labels, replacing their timings with the aligned
    timings taken from the corresponding mono labels.

    Mono and full label files are paired by sorted order; each pair must
    have the same number of entries.
    """
    mono_lab_files = sorted(glob(join(mono_dir, "*.lab")))
    full_lab_files = sorted(glob(join(full_dir, "*.lab")))
    for mono_path, full_path in zip(mono_lab_files, full_lab_files):
        mono_lab = hts.load(mono_path)
        full_lab = hts.load(full_path)
        assert len(mono_lab) == len(full_lab)
        full_lab.start_times = mono_lab.start_times
        full_lab.end_times = mono_lab.end_times
        with open(join(full_align_dir, basename(mono_path)), "w") as of:
            of.write(str(full_lab))


def _prep_timelag(full_dir, full_align_dir, dst_dir,
                  allowed_range, allowed_range_rest, offset_threshold):
    """Prepare note-onset label pairs (aligned vs. score) for time-lag models.

    Applies a global offset correction when the mean onset gap exceeds
    ``offset_threshold`` (seconds), then drops notes whose time-lag falls
    outside ``allowed_range`` (``allowed_range_rest`` for silences), both
    expressed in 5 ms frames.
    """
    lab_align_dst_dir = join(dst_dir, "label_phone_align")
    lab_score_dst_dir = join(dst_dir, "label_phone_score")
    for d in [lab_align_dst_dir, lab_score_dst_dir]:
        os.makedirs(d, exist_ok=True)
    print("Prepare data for time-lag models")
    for lab_align_path in sorted(glob(join(full_align_dir, "*.lab"))):
        lab_score_path = join(full_dir, basename(lab_align_path))
        assert exists(lab_score_path)
        name = basename(lab_align_path)
        lab_align = hts.load(lab_align_path)
        lab_score = hts.load(lab_score_path)
        # Note onsets in both versions; their mean difference estimates the
        # global offset between the alignment and the musical score.
        note_indices = get_note_indices(lab_score)
        onset_align = np.asarray(lab_align[note_indices].start_times)
        onset_score = np.asarray(lab_score[note_indices].start_times)
        global_offset = (onset_align - onset_score).mean()
        # Round to a multiple of 50000 (5 ms in HTK 100 ns time units).
        global_offset = int(round(global_offset / 50000) * 50000)
        # Apply offset correction only when there is a big gap
        # (1e-7 converts 100 ns units to seconds).
        if np.abs(global_offset * 1e-7) > offset_threshold:
            print("{}: Global offset (in sec): {}".format(name, global_offset * 1e-7))
            lab_score.start_times = list(np.asarray(lab_score.start_times) + global_offset)
            lab_score.end_times = list(np.asarray(lab_score.end_times) + global_offset)
            onset_score += global_offset
        # Exclude large-diff parts (probably a bug of musicxml or alignment).
        valid_note_indices = []
        for idx, (a, b) in enumerate(zip(onset_align, onset_score)):
            note_idx = note_indices[idx]
            lag = np.abs(a - b) / 50000  # time-lag in 5 ms frames
            limits = (allowed_range_rest
                      if _is_silence(lab_score.contexts[note_idx])
                      else allowed_range)
            if limits[0] <= lag <= limits[1]:
                valid_note_indices.append(note_idx)
        if len(valid_note_indices) < len(note_indices):
            excluded = len(note_indices) - len(valid_note_indices)
            print("{}: {}/{} time-lags are excluded.".format(name, excluded, len(note_indices)))
        # Keep note onsets only; they are the time-lag model's targets.
        lab_align = lab_align[valid_note_indices]
        lab_score = lab_score[valid_note_indices]
        with open(join(lab_align_dst_dir, name), "w") as of:
            of.write(str(lab_align))
        with open(join(lab_score_dst_dir, name), "w") as of:
            of.write(str(lab_score))


def _prep_duration(full_align_dir, dst_dir):
    """Copy the aligned full-context labels as duration-model training data."""
    lab_align_dst_dir = join(dst_dir, "label_phone_align")
    os.makedirs(lab_align_dst_dir, exist_ok=True)
    print("Prepare data for duration models")
    for lab_align_path in sorted(glob(join(full_align_dir, "*.lab"))):
        lab_align = hts.load(lab_align_path)
        with open(join(lab_align_dst_dir, basename(lab_align_path)), "w") as of:
            of.write(str(lab_align))


def _prep_acoustic(hts_demo_root, full_dir, full_align_dir, dst_dir, gain_normalize):
    """Prepare wav + label pairs for acoustic models.

    Audio is read from ``data/wav`` when present, otherwise from the raw
    16-bit PCM files in ``data/raw`` (assumed 48 kHz). When
    ``gain_normalize`` is true, the waveform is peak-normalized to 0.99.
    """
    wav_dst_dir = join(dst_dir, "wav")
    lab_align_dst_dir = join(dst_dir, "label_phone_align")
    lab_score_dst_dir = join(dst_dir, "label_phone_score")
    for d in [wav_dst_dir, lab_align_dst_dir, lab_score_dst_dir]:
        os.makedirs(d, exist_ok=True)
    print("Prepare data for acoustic models")
    for lab_align_path in sorted(glob(join(full_align_dir, "*.lab"))):
        name = splitext(basename(lab_align_path))[0]
        lab_score_path = join(full_dir, name + ".lab")
        assert exists(lab_score_path)
        wav_path = join(hts_demo_root, "data", "wav", name + ".wav")
        raw_path = join(hts_demo_root, "data", "raw", name + ".raw")
        if exists(wav_path):
            wav, sr = librosa.load(wav_path, sr=48000)
        else:
            # BUGFIX: original asserted the (always truthy) path string
            # instead of checking that the fallback file exists.
            assert exists(raw_path)
            wav = np.fromfile(raw_path, dtype=np.int16)
            wav = wav.astype(np.float32) / 2 ** 15
            sr = 48000
        if gain_normalize:
            # BUGFIX: normalize by the absolute peak; dividing by the signed
            # max could push negative peaks past -1.0 and clip.
            wav = wav / np.abs(wav).max() * 0.99
        lab_align = hts.load(lab_align_path)
        lab_score = hts.load(lab_score_path)
        # Save audio. TODO: consider an explicit subtype for sf.write.
        sf.write(join(wav_dst_dir, name + ".wav"), wav, sr)
        # Save labels
        with open(join(lab_align_dst_dir, name + ".lab"), "w") as of:
            of.write(str(lab_align))
        with open(join(lab_score_dst_dir, name + ".lab"), "w") as of:
            of.write(str(lab_score))


def data_prep_function(hts_demo_root, out_dir, gain_normalize, logger):
    """Prepare NNSVS-style training data from an HTS demo directory.

    Reads mono/full HTS labels under ``hts_demo_root/data/labels`` and
    audio under ``hts_demo_root/data/{wav,raw}``, then writes three
    datasets under ``out_dir``: ``timelag`` (note-onset label pairs),
    ``duration`` (aligned phone labels) and ``acoustic`` (wav + labels).

    :param hts_demo_root: root of the HTS demo data directory
    :param out_dir: destination directory for the prepared datasets
    :param gain_normalize: if true, peak-normalize the audio to 0.99
    :param logger: logger used for stage progress messages
    """
    hts_label_root = join(hts_demo_root, "data/labels")
    # Time-lag constraints (in 5 ms frames) used to filter outliers
    timelag_allowed_range = (-20, 19)
    timelag_allowed_range_rest = (-40, 39)
    offset_correction_threshold = 0.005  # seconds
    mono_dir = join(hts_label_root, "mono")
    full_dir = join(hts_label_root, "full")

    logger.info("Make aligned full context labels")
    # Note: this is saved under the hts_label_root directory
    full_align_dir = join(hts_label_root, "full_align")
    os.makedirs(full_align_dir, exist_ok=True)
    _make_aligned_full_labels(mono_dir, full_dir, full_align_dir)

    logger.info("Prepare data for time-lag models")
    _prep_timelag(full_dir, full_align_dir, join(out_dir, "timelag"),
                  timelag_allowed_range, timelag_allowed_range_rest,
                  offset_correction_threshold)

    logger.info("Prepare data for duration models")
    _prep_duration(full_align_dir, join(out_dir, "duration"))

    logger.info("Prepare data for acoustic models")
    _prep_acoustic(hts_demo_root, full_dir, full_align_dir,
                   join(out_dir, "acoustic"), gain_normalize)
AI助手
- 运行
$ data_prep_function(properties['hts_root'], properties['out_dir'], properties['gain_normalize'],logger)
AI助手
全部评论 (0)
还没有任何评论哟~
