# Hyperparameter definitions for the speech-synthesis model: text cleaners,
# audio front-end, network layer sizes, and training/evaluation settings.
import tensorflow as tf
|
|
|
|
SCALE_FACTOR = 1
|
|
|
|
def f(num):
|
|
return num // SCALE_FACTOR
|
|
|
|
basic_params = {
|
|
# Comma-separated list of cleaners to run on text prior to training and eval. For non-English
|
|
# text, you may want to use "basic_cleaners" or "transliteration_cleaners" See TRAINING_DATA.md.
|
|
'cleaners': 'korean_cleaners',
|
|
}
|
|
|
|
basic_params.update({
|
|
# Audio
|
|
'num_mels': 80,
|
|
'num_freq': 1025,
|
|
'sample_rate': 20000,
|
|
'frame_length_ms': 50,
|
|
'frame_shift_ms': 12.5,
|
|
'preemphasis': 0.97,
|
|
'min_level_db': -100,
|
|
'ref_level_db': 20,
|
|
})
|
|
|
|
if True:
|
|
basic_params.update({
|
|
'sample_rate': 24000,
|
|
})
|
|
|
|
basic_params.update({
|
|
# Model
|
|
'model_type': 'single', # [single, simple, deepvoice]
|
|
'speaker_embedding_size': f(16),
|
|
|
|
'embedding_size': f(256),
|
|
'dropout_prob': 0.5,
|
|
|
|
# Encoder
|
|
'enc_prenet_sizes': [f(256), f(128)],
|
|
'enc_bank_size': 16,
|
|
'enc_bank_channel_size': f(128),
|
|
'enc_maxpool_width': 2,
|
|
'enc_highway_depth': 4,
|
|
'enc_rnn_size': f(128),
|
|
'enc_proj_sizes': [f(128), f(128)],
|
|
'enc_proj_width': 3,
|
|
|
|
# Attention
|
|
'attention_type': 'bah_mon', # ntm2-5
|
|
'attention_size': f(256),
|
|
'attention_state_size': f(256),
|
|
|
|
# Decoder recurrent network
|
|
'dec_layer_num': 2,
|
|
'dec_rnn_size': f(256),
|
|
|
|
# Decoder
|
|
'dec_prenet_sizes': [f(256), f(128)],
|
|
'post_bank_size': 8,
|
|
'post_bank_channel_size': f(256),
|
|
'post_maxpool_width': 2,
|
|
'post_highway_depth': 4,
|
|
'post_rnn_size': f(128),
|
|
'post_proj_sizes': [f(256), 80], # num_mels=80
|
|
'post_proj_width': 3,
|
|
|
|
'reduction_factor': 4,
|
|
})
|
|
|
|
if False: # Deep Voice 2
|
|
basic_params.update({
|
|
'dropout_prob': 0.8,
|
|
|
|
'attention_size': f(512),
|
|
|
|
'dec_prenet_sizes': [f(256), f(128), f(64)],
|
|
'post_bank_channel_size': f(512),
|
|
'post_rnn_size': f(256),
|
|
|
|
'reduction_factor': 4,
|
|
})
|
|
elif True: # Deep Voice 2
|
|
basic_params.update({
|
|
'dropout_prob': 0.8,
|
|
|
|
#'attention_size': f(512),
|
|
|
|
#'dec_prenet_sizes': [f(256), f(128)],
|
|
#'post_bank_channel_size': f(512),
|
|
'post_rnn_size': f(256),
|
|
|
|
'reduction_factor': 4,
|
|
})
|
|
elif False: # Single Speaker
|
|
basic_params.update({
|
|
'dropout_prob': 0.5,
|
|
|
|
'attention_size': f(128),
|
|
|
|
'post_bank_channel_size': f(128),
|
|
#'post_rnn_size': f(128),
|
|
|
|
'reduction_factor': 4,
|
|
})
|
|
elif False: # Single Speaker with generalization
|
|
basic_params.update({
|
|
'dropout_prob': 0.8,
|
|
|
|
'attention_size': f(256),
|
|
|
|
'dec_prenet_sizes': [f(256), f(128), f(64)],
|
|
'post_bank_channel_size': f(128),
|
|
'post_rnn_size': f(128),
|
|
|
|
'reduction_factor': 4,
|
|
})
|
|
|
|
|
|
basic_params.update({
|
|
# Training
|
|
'batch_size': 16,
|
|
'adam_beta1': 0.9,
|
|
'adam_beta2': 0.999,
|
|
'use_fixed_test_inputs': False,
|
|
|
|
'initial_learning_rate': 0.002,
|
|
'decay_learning_rate_mode': 0,
|
|
'initial_data_greedy': True,
|
|
'initial_phase_step': 8000,
|
|
'main_data_greedy_factor': 0,
|
|
'main_data': [''],
|
|
'prioritize_loss': False,
|
|
|
|
'recognition_loss_coeff': 0.2,
|
|
'ignore_recognition_level': 1, # 0: use all, 1: ignore only unmatched_alignment, 2: fully ignore recognition
|
|
|
|
# Eval
|
|
'min_tokens': 50,
|
|
'min_iters': 30,
|
|
'max_iters': 200,
|
|
'skip_inadequate': False,
|
|
|
|
'griffin_lim_iters': 60,
|
|
'power': 1.5, # Power to raise magnitudes to prior to Griffin-Lim
|
|
})
|
|
|
|
|
|
# Default hyperparameters:
|
|
hparams = tf.contrib.training.HParams(**basic_params)
|
|
|
|
|
|
def hparams_debug_string():
|
|
values = hparams.values()
|
|
hp = [' %s: %s' % (name, values[name]) for name in sorted(values)]
|
|
return 'Hyperparameters:\n' + '\n'.join(hp)
|