
Singing Voice Synthesis Tutorial Series, Part 1: Data Preprocessing for Singing Voice Synthesis


This article describes a data preparation function for singing voice synthesis and audio processing. The core approach to parametric singing synthesis is similar to how Merlin trains statistical speech models, and involves building and using a time-lag model, a duration model, and an acoustic model. The function covers label processing (HTK-style full-context and mono labels), timing correction (by computing and applying a global offset), organizing the data for each model, and audio feature extraction (e.g., f0 and aperiodicity), with optional gain normalization. The goal is to provide high-quality aligned labels and audio features for downstream singing voice synthesis or music generation tasks.

Contents

  • Approach

    • Core Idea
  • Implementation

Approach

Core Idea

The parametric approach to synthesizing singing voices resembles how Merlin trains speech models; the pipeline involves a time-lag model, a duration model, and an acoustic model as its key components.
The annotations required per sample are:
full-context labels (HTK-style), mono labels carrying per-phoneme durations, and acoustic features extracted from the wav files: fundamental frequency (f0), spectral envelope (sp), and aperiodicity (ap).
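
For a concrete sense of the mono duration labels mentioned above, here is a minimal sketch that loads one with nnmnkwii (the same library used in the implementation below) and prints each phoneme's duration. The file name is a placeholder, and the 100 ns time unit is the HTK label convention.

    # Minimal sketch: inspect a mono (phoneme + duration) label file.
    # "sample.lab" is a placeholder path; point it at a file from your corpus.
    from nnmnkwii.io import hts

    mono_lab = hts.load("sample.lab")
    for start, end, phone in zip(mono_lab.start_times, mono_lab.end_times, mono_lab.contexts):
        # start/end are in 100 ns units; divide by 1e7 to get seconds
        print(phone, (end - start) / 1e7, "sec")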

* Vocoder: pyworld
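
Since the features above (f0, spectral envelope, aperiodicity) come from the WORLD vocoder, here is a minimal sketch of extracting them with pyworld. The file name is a placeholder, and the 48 kHz sample rate simply matches the rate assumed in the preparation code below.

    # Minimal sketch: extract f0 / spectral envelope / aperiodicity with pyworld.
    # "sample.wav" is a placeholder; 48 kHz matches the rate used in data_prep_function.
    import librosa
    import pyworld

    wav, sr = librosa.load("sample.wav", sr=48000)
    wav = wav.astype("float64")                      # pyworld expects float64 input

    f0, timeaxis = pyworld.dio(wav, sr)              # coarse f0 estimation
    f0 = pyworld.stonemask(wav, f0, timeaxis, sr)    # f0 refinement
    sp = pyworld.cheaptrick(wav, f0, timeaxis, sr)   # spectral envelope
    ap = pyworld.d4c(wav, f0, timeaxis, sr)          # aperiodicity
    print(f0.shape, sp.shape, ap.shape)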

Implementation

    #!/usr/bin/python
    # -*- coding:utf-8 -*-
    '''
    -------------------------------------------------
       File Name   :  data_prep
       Description :  AIM: 
                  Functions: 1. 
                             2. 
       Functions   :
       Envs        :  python == 
                  pip install  
       Date        :  2020/11/4  20:23
       CodeStyle   :  standardized, concise, clear, readable, maintainable, portable!
    -------------------------------------------------
       Change Activity:
          2020/11/4 : tag
    -------------------------------------------------
    __Author__ = "Yan Errol"
    __Email__ = "2681506@gmail.com"
    __Copyright__ = "Copyright 2020, Yan Errol"
    -------------------------------------------------
    '''
    
    # coding: utf-8
    import os
    
    import argparse
    from glob import glob
    from os.path import join, basename, splitext, exists, expanduser
    from nnmnkwii.io import hts
    from scipy.io import wavfile
    import librosa
    import soundfile as sf
    import sys
    import numpy as np
    
    from nnsvs.io.hts import get_note_indices
    
    
    def _is_silence(l):
        is_full_context = "@" in l
        if is_full_context:
            is_silence = ("-sil" in l or "-pau" in l)
        else:
            is_silence = (l == "sil" or l == "pau")
        return is_silence
    
    
    def remove_sil_and_pau(lab):
        newlab = hts.HTSLabelFile()
        for l in lab:
            if "-sil" not in l[-1] and "-pau" not in l[-1]:
                newlab.append(l, strict=False)

        return newlab
    
    
    
    def data_prep_function(hts_demo_root, out_dir, gain_normalize, logger):
        '''Prepare aligned labels and audio for the time-lag, duration and acoustic models.'''
        hts_label_root = join(hts_demo_root, "data/labels")
        # Time-lag constraints to filter outliers
        timelag_allowed_range = (-20, 19)
        timelag_allowed_range_rest = (-40, 39)

        offset_correction_threshold = 0.005

        mono_dir = join(hts_label_root, "mono")
        full_dir = join(hts_label_root, "full")
        logger.info("Make aligned full context labels")
        ### Make aligned full context labels

        # Note: this will be saved under hts_label_root directory
        full_align_dir = join(hts_label_root, "full_align")
        os.makedirs(full_align_dir, exist_ok=True)

        mono_lab_files = sorted(glob(join(mono_dir, "*.lab")))
        full_lab_files = sorted(glob(join(full_dir, "*.lab")))
        for m, f in zip(mono_lab_files, full_lab_files):
            mono_lab = hts.load(m)
            full_lab = hts.load(f)
            assert len(mono_lab) == len(full_lab)
            full_lab.start_times = mono_lab.start_times
            full_lab.end_times = mono_lab.end_times
            name = basename(m)
            dst_path = join(full_align_dir, name)
            with open(dst_path, "w") as of:
                of.write(str(full_lab))
    
        ### Prepare data for time-lag models
        logger.info("Prepare data for time-lag models")

        dst_dir = join(out_dir, "timelag")
        lab_align_dst_dir = join(dst_dir, "label_phone_align")
        lab_score_dst_dir = join(dst_dir, "label_phone_score")

        for d in [lab_align_dst_dir, lab_score_dst_dir]:
            os.makedirs(d, exist_ok=True)

        print("Prepare data for time-lag models")
        full_lab_align_files = sorted(glob(join(full_align_dir, "*.lab")))
        for lab_align_path in full_lab_align_files:
            lab_score_path = join(full_dir, basename(lab_align_path))
            assert exists(lab_score_path)
            name = basename(lab_align_path)

            lab_align = hts.load(lab_align_path)
            lab_score = hts.load(lab_score_path)

            # Extract note onsets and compute an offset
            note_indices = get_note_indices(lab_score)

            onset_align = np.asarray(lab_align[note_indices].start_times)
            onset_score = np.asarray(lab_score[note_indices].start_times)

            global_offset = (onset_align - onset_score).mean()
            global_offset = int(round(global_offset / 50000) * 50000)

            # Apply offset correction only when there is a big gap
            apply_offset_correction = np.abs(global_offset * 1e-7) > offset_correction_threshold
            if apply_offset_correction:
                print("{}: Global offset (in sec): {}".format(name, global_offset * 1e-7))
                lab_score.start_times = list(np.asarray(lab_score.start_times) + global_offset)
                lab_score.end_times = list(np.asarray(lab_score.end_times) + global_offset)
                onset_score += global_offset

            # Exclude large diff parts (probably a bug of musicxml or alignment though)
            valid_note_indices = []
            for idx, (a, b) in enumerate(zip(onset_align, onset_score)):
                note_idx = note_indices[idx]
                lag = np.abs(a - b) / 50000
                if _is_silence(lab_score.contexts[note_idx]):
                    if lag >= timelag_allowed_range_rest[0] and lag <= timelag_allowed_range_rest[1]:
                        valid_note_indices.append(note_idx)
                else:
                    if lag >= timelag_allowed_range[0] and lag <= timelag_allowed_range[1]:
                        valid_note_indices.append(note_idx)

            if len(valid_note_indices) < len(note_indices):
                D = len(note_indices) - len(valid_note_indices)
                print("{}: {}/{} time-lags are excluded.".format(name, D, len(note_indices)))

            # Note onsets as labels
            lab_align = lab_align[valid_note_indices]
            lab_score = lab_score[valid_note_indices]

            # Save lab files
            lab_align_dst_path = join(lab_align_dst_dir, name)
            with open(lab_align_dst_path, "w") as of:
                of.write(str(lab_align))

            lab_score_dst_path = join(lab_score_dst_dir, name)
            with open(lab_score_dst_path, "w") as of:
                of.write(str(lab_score))
    
        ### Prepare data for duration models
        logger.info("Prepare data for duration models")

        dst_dir = join(out_dir, "duration")
        lab_align_dst_dir = join(dst_dir, "label_phone_align")

        for d in [lab_align_dst_dir]:
            os.makedirs(d, exist_ok=True)

        print("Prepare data for duration models")
        full_lab_align_files = sorted(glob(join(full_align_dir, "*.lab")))
        for lab_align_path in full_lab_align_files:
            name = basename(lab_align_path)

            lab_align = hts.load(lab_align_path)

            # Save lab file
            lab_align_dst_path = join(lab_align_dst_dir, name)
            with open(lab_align_dst_path, "w") as of:
                of.write(str(lab_align))
    
    
        ### Prepare data for acoustic models
        logger.info("Prepare data for acoustic models")

        dst_dir = join(out_dir, "acoustic")
        wav_dst_dir = join(dst_dir, "wav")
        lab_align_dst_dir = join(dst_dir, "label_phone_align")
        lab_score_dst_dir = join(dst_dir, "label_phone_score")

        for d in [wav_dst_dir, lab_align_dst_dir, lab_score_dst_dir]:
            os.makedirs(d, exist_ok=True)

        print("Prepare data for acoustic models")
        full_lab_align_files = sorted(glob(join(full_align_dir, "*.lab")))
        for lab_align_path in full_lab_align_files:
            name = splitext(basename(lab_align_path))[0]
            lab_score_path = join(full_dir, name + ".lab")
            assert exists(lab_score_path)
            wav_path = join(hts_demo_root, "data", "wav", name + ".wav")
            raw_path = join(hts_demo_root, "data", "raw", name + ".raw")

            # We can load and manipulate audio (e.g., normalizing gain), but for now just copy it as is
            if exists(wav_path):
                # sr, wave = wavfile.read(wav_path)
                wav, sr = librosa.load(wav_path, sr=48000)
            else:
                assert exists(raw_path)
                wav = np.fromfile(raw_path, dtype=np.int16)
                wav = wav.astype(np.float32) / 2**15
                sr = 48000

            if gain_normalize:
                wav = wav / wav.max() * 0.99

            lab_align = hts.load(lab_align_path)
            lab_score = hts.load(lab_score_path)

            # Save audio
            wav_dst_path = join(wav_dst_dir, name + ".wav")
            # TODO: consider explicit subtype
            sf.write(wav_dst_path, wav, sr)

            # Save label
            lab_align_dst_path = join(lab_align_dst_dir, name + ".lab")
            with open(lab_align_dst_path, "w") as of:
                of.write(str(lab_align))

            lab_score_dst_path = join(lab_score_dst_dir, name + ".lab")
            with open(lab_score_dst_path, "w") as of:
                of.write(str(lab_score))
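
A note on the time units in the code above: HTK-style label files store times in units of 100 ns, which is why offsets are multiplied by 1e-7 to obtain seconds and rounded to multiples of 50000 (i.e., 5 ms, a common frame shift). A tiny sanity check of that conversion:

    # Sanity check of the unit handling used in data_prep_function.
    offset_units = 123456                              # example offset in 100 ns units
    rounded = int(round(offset_units / 50000) * 50000)
    print(rounded)                                     # 100000, i.e. two 5 ms steps
    print(rounded * 1e-7, "sec")                       # 0.01 sec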
  • Run
    data_prep_function(properties['hts_root'], properties['out_dir'], properties['gain_normalize'], logger)
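
The properties dict and logger are not defined in the listing above; the following is a minimal driver sketch, assuming a standard-library logger and placeholder paths that you would replace with your own HTS demo root and output directory.

    # Minimal driver sketch; the paths are placeholders.
    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("data_prep")

    properties = {
        "hts_root": "/path/to/hts_demo_root",   # contains data/labels, data/wav (or data/raw)
        "out_dir": "/path/to/out",              # timelag/duration/acoustic data is written here
        "gain_normalize": True,                 # rescale each waveform to a 0.99 peak
    }

    data_prep_function(properties["hts_root"], properties["out_dir"],
                       properties["gain_normalize"], logger)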
