# multi-speaker-tacotron-tens.../hparams.py
import tensorflow as tf

# Uniform width multiplier: every layer size wrapped in f() below is divided by this,
# so raising SCALE_FACTOR shrinks the whole model.
SCALE_FACTOR = 1

def f(num):
    return num // SCALE_FACTOR
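# Illustrative only: with the default SCALE_FACTOR = 1, f(256) == 256 and the sizes
# below are used as written; SCALE_FACTOR = 2 would halve them, e.g. f(256) == 128,
# giving a uniformly slimmer model.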
basic_params = {
    # Comma-separated list of cleaners to run on text prior to training and eval. For
    # non-English text, you may want to use "basic_cleaners" or
    # "transliteration_cleaners". See TRAINING_DATA.md.
    'cleaners': 'korean_cleaners',
}
basic_params.update({
    # Audio
    'num_mels': 80,
    'num_freq': 1025,
    'sample_rate': 20000,
    'frame_length_ms': 50,
    'frame_shift_ms': 12.5,
    'preemphasis': 0.97,
    'min_level_db': -100,
    'ref_level_db': 20,
})
if True:
    # Overrides the audio block above: use a 24 kHz sample rate instead of 20 kHz.
    basic_params.update({
        'sample_rate': 24000,
    })
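# Illustrative only (hypothetical names; this file does not compute them): a typical
# Tacotron audio module would derive its STFT settings from the values above as
#   n_fft      = (num_freq - 1) * 2                   = (1025 - 1) * 2      = 2048
#   hop_length = sample_rate * frame_shift_ms / 1000  = 24000 * 12.5 / 1000 = 300
#   win_length = sample_rate * frame_length_ms / 1000 = 24000 * 50 / 1000   = 1200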
basic_params.update({
    # Model
    'model_type': 'single',  # [single, simple, deepvoice]
    'speaker_embedding_size': f(16),
    'embedding_size': f(256),
    'dropout_prob': 0.5,

    # Encoder
    'enc_prenet_sizes': [f(256), f(128)],
    'enc_bank_size': 16,
    'enc_bank_channel_size': f(128),
    'enc_maxpool_width': 2,
    'enc_highway_depth': 4,
    'enc_rnn_size': f(128),
    'enc_proj_sizes': [f(128), f(128)],
    'enc_proj_width': 3,

    # Attention
    'attention_type': 'bah_mon',  # ntm2-5
    'attention_size': f(256),
    'attention_state_size': f(256),

    # Decoder recurrent network
    'dec_layer_num': 2,
    'dec_rnn_size': f(256),

    # Decoder
    'dec_prenet_sizes': [f(256), f(128)],
    'post_bank_size': 8,
    'post_bank_channel_size': f(256),
    'post_maxpool_width': 2,
    'post_highway_depth': 4,
    'post_rnn_size': f(128),
    'post_proj_sizes': [f(256), 80],  # num_mels=80
    'post_proj_width': 3,
    'reduction_factor': 4,
})
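# Reference note only (the training code computes this itself): with the standard
# Tacotron "r" trick, reduction_factor = 4 and num_mels = 80 mean the decoder emits
# 4 frames per step, i.e. 4 * 80 = 320 mel values per decoder step.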
if False:  # Deep Voice 2
    basic_params.update({
        'dropout_prob': 0.8,
        'attention_size': f(512),
        'dec_prenet_sizes': [f(256), f(128), f(64)],
        'post_bank_channel_size': f(512),
        'post_rnn_size': f(256),
        'reduction_factor': 4,
    })
elif True:  # Deep Voice 2
    basic_params.update({
        'dropout_prob': 0.8,
        #'attention_size': f(512),
        #'dec_prenet_sizes': [f(256), f(128)],
        #'post_bank_channel_size': f(512),
        'post_rnn_size': f(256),
        'reduction_factor': 4,
    })
elif False:  # Single Speaker
    basic_params.update({
        'dropout_prob': 0.5,
        'attention_size': f(128),
        'post_bank_channel_size': f(128),
        #'post_rnn_size': f(128),
        'reduction_factor': 4,
    })
elif False:  # Single Speaker with generalization
    basic_params.update({
        'dropout_prob': 0.8,
        'attention_size': f(256),
        'dec_prenet_sizes': [f(256), f(128), f(64)],
        'post_bank_channel_size': f(128),
        'post_rnn_size': f(128),
        'reduction_factor': 4,
    })
basic_params.update({
    # Training
    'batch_size': 16,
    'adam_beta1': 0.9,
    'adam_beta2': 0.999,
    'use_fixed_test_inputs': False,
    'initial_learning_rate': 0.002,
    'decay_learning_rate_mode': 0,
    'initial_data_greedy': True,
    'initial_phase_step': 8000,
    'main_data_greedy_factor': 0,
    'main_data': [''],
    'prioritize_loss': False,
    'recognition_loss_coeff': 0.2,
    'ignore_recognition_level': 1,  # 0: use all, 1: ignore only unmatched_alignment, 2: fully ignore recognition

    # Eval
    'min_tokens': 50,
    'min_iters': 30,
    'max_iters': 200,
    'skip_inadequate': False,
    'griffin_lim_iters': 60,
    'power': 1.5,  # Power to raise magnitudes to prior to Griffin-Lim
})
# Default hyperparameters:
hparams = tf.contrib.training.HParams(**basic_params)

def hparams_debug_string():
    values = hparams.values()
    hp = [' %s: %s' % (name, values[name]) for name in sorted(values)]
    return 'Hyperparameters:\n' + '\n'.join(hp)
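
if __name__ == '__main__':
    # Minimal usage sketch (not part of the original module): tf.contrib.training.HParams
    # can be overridden from a comma-separated "name=value" string via parse(), which
    # pairs naturally with the debug helper above.
    hparams.parse('batch_size=32,initial_learning_rate=0.001')
    print(hparams_debug_string())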