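"""Train a multi-speaker Tacotron TTS model.

Builds a training model and a weight-sharing test model, feeds them from
DataFeeder queues, and periodically writes TensorBoard summaries,
checkpoints, and audio/alignment samples to the log directory.
"""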
import os
import time
import math
import argparse
import traceback
import subprocess
import numpy as np
from jamo import h2j
import tensorflow as tf
from datetime import datetime
from functools import partial

from hparams import hparams, hparams_debug_string
from models import create_model, get_most_recent_checkpoint

from utils import ValueWindow, prepare_dirs
from utils import infolog, warning, plot, load_hparams
from utils import get_git_revision_hash, get_git_diff, str2bool, parallel_run

from audio import save_audio, inv_spectrogram
from text import sequence_to_text, text_to_sequence
from datasets.datafeeder import DataFeeder, _prepare_inputs

log = infolog.log


def create_batch_inputs_from_texts(texts):
    sequences = [text_to_sequence(text) for text in texts]

    inputs = _prepare_inputs(sequences)
    input_lengths = np.asarray([len(x) for x in inputs], dtype=np.int32)

    # Sanity check: decoding each prepared sequence should recover the
    # jamo-decomposed form of the original text; log any mismatches.
    for idx, (seq, text) in enumerate(zip(inputs, texts)):
        recovered_text = sequence_to_text(seq, skip_eos_and_pad=True)
        if recovered_text != h2j(text):
            log(" [{}] {}".format(idx, text))
            log(" [{}] {}".format(idx, recovered_text))
            log("=" * 30)

    return inputs, input_lengths


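# Illustrative use of create_batch_inputs_from_texts (the sample strings are
# placeholders; any hangul text accepted by text_to_sequence works):
#
#   inputs, input_lengths = create_batch_inputs_from_texts(
#       ["텍스트 예시입니다.", "두 번째 문장."])
#
# `inputs` holds one padded row of token IDs per input text.

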
def get_git_commit():
    subprocess.check_output(['git', 'diff-index', '--quiet', 'HEAD'])  # Verify client is clean
    commit = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()[:10]
    log('Git commit: %s' % commit)
    return commit


def add_stats(model, model2=None, scope_name='train'):
    with tf.variable_scope(scope_name) as scope:
        summaries = [
            tf.summary.scalar('loss_mel', model.mel_loss),
            tf.summary.scalar('loss_linear', model.linear_loss),
            tf.summary.scalar('loss', model.loss_without_coeff),
        ]

        if scope_name == 'train':
            gradient_norms = [tf.norm(grad) for grad in model.gradients if grad is not None]

            summaries.extend([
                tf.summary.scalar('learning_rate', model.learning_rate),
                tf.summary.scalar('max_gradient_norm', tf.reduce_max(gradient_norms)),
            ])

    # When a second model is given, also summarize the loss gap between the
    # two (e.g. test minus train) to monitor overfitting.
    if model2 is not None:
        with tf.variable_scope('gap_test-train') as scope:
            summaries.extend([
                tf.summary.scalar('loss_mel',
                        model.mel_loss - model2.mel_loss),
                tf.summary.scalar('loss_linear',
                        model.linear_loss - model2.linear_loss),
                tf.summary.scalar('loss',
                        model.loss_without_coeff - model2.loss_without_coeff),
            ])

    return tf.summary.merge(summaries)


def save_and_plot_fn(args, log_dir, step, loss, prefix):
    idx, (seq, spec, align) = args

    audio_path = os.path.join(
            log_dir, '{}-step-{:09d}-audio{:03d}.wav'.format(prefix, step, idx))
    align_path = os.path.join(
            log_dir, '{}-step-{:09d}-align{:03d}.png'.format(prefix, step, idx))

    waveform = inv_spectrogram(spec.T)
    save_audio(waveform, audio_path)

    info_text = 'step={:d}, loss={:.5f}'.format(step, loss)
    plot.plot_alignment(
            align, align_path, info=info_text,
            text=sequence_to_text(seq,
                    skip_eos_and_pad=True, combine_jamo=True))


def save_and_plot(sequences, spectrograms,
        alignments, log_dir, step, loss, prefix):

    fn = partial(save_and_plot_fn,
            log_dir=log_dir, step=step, loss=loss, prefix=prefix)
    items = list(enumerate(zip(sequences, spectrograms, alignments)))

    parallel_run(fn, items, parallel=False)
    log('Test finished for step {}.'.format(step))


def train(log_dir, config):
    data_dirs = [os.path.join(data_path, "data")
                 for data_path in config.data_paths]
    num_speakers = len(data_dirs)
    config.num_test = config.num_test_per_speaker * num_speakers

    if num_speakers > 1 and hparams.model_type not in ["deepvoice", "simple"]:
        raise Exception("[!] Unknown model_type for multi-speaker: {}".format(hparams.model_type))

    commit = get_git_commit() if config.git else 'None'
    checkpoint_path = os.path.join(log_dir, 'model.ckpt')

    log(' [*] git rev-parse HEAD:\n%s' % get_git_revision_hash())
    log('=' * 50)
    log(' [*] git diff:\n%s' % get_git_diff())
    log('=' * 50)
    log(' [*] Checkpoint path: %s' % checkpoint_path)
    log(' [*] Loading training data from: %s' % data_dirs)
    log(' [*] Using model: %s' % config.model_dir)
    log(hparams_debug_string())

    # Set up DataFeeder:
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder') as scope:
        train_feeder = DataFeeder(
                coord, data_dirs, hparams, config, 32,
                data_type='train', batch_size=hparams.batch_size)
        test_feeder = DataFeeder(
                coord, data_dirs, hparams, config, 8,
                data_type='test', batch_size=config.num_test)

    # Set up model:
    is_randomly_initialized = config.initialize_path is None
    global_step = tf.Variable(0, name='global_step', trainable=False)

    with tf.variable_scope('model') as scope:
        model = create_model(hparams)
        model.initialize(
                train_feeder.inputs, train_feeder.input_lengths,
                num_speakers, train_feeder.speaker_id,
                train_feeder.mel_targets, train_feeder.linear_targets,
                train_feeder.loss_coeff,
                is_randomly_initialized=is_randomly_initialized)

        model.add_loss()
        model.add_optimizer(global_step)
        train_stats = add_stats(model, scope_name='stats')  # legacy

    # Build the test model with reused weights; its decoder runs in
    # inference mode (rnn_decoder_test_mode=True).
    with tf.variable_scope('model', reuse=True) as scope:
        test_model = create_model(hparams)
        test_model.initialize(
                test_feeder.inputs, test_feeder.input_lengths,
                num_speakers, test_feeder.speaker_id,
                test_feeder.mel_targets, test_feeder.linear_targets,
                test_feeder.loss_coeff, rnn_decoder_test_mode=True,
                is_randomly_initialized=is_randomly_initialized)
        test_model.add_loss()

    test_stats = add_stats(test_model, model, scope_name='test')
    test_stats = tf.summary.merge([test_stats, train_stats])

    # Bookkeeping:
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=5, keep_checkpoint_every_n_hours=2)

    sess_config = tf.ConfigProto(
            log_device_placement=False,
            allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True

    # Train!
    with tf.Session(config=sess_config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
            sess.run(tf.global_variables_initializer())

            if config.load_path:
                # Restore from a checkpoint if the user requested it.
                restore_path = get_most_recent_checkpoint(config.model_dir)
                saver.restore(sess, restore_path)
                log('Resuming from checkpoint: %s at commit: %s' % (restore_path, commit), slack=True)
            elif config.initialize_path:
                # Warm-start from another run's weights, then restart the step counter.
                restore_path = get_most_recent_checkpoint(config.initialize_path)
                saver.restore(sess, restore_path)
                log('Initialized from checkpoint: %s at commit: %s' % (restore_path, commit), slack=True)

                zero_step_assign = tf.assign(global_step, 0)
                sess.run(zero_step_assign)

                start_step = sess.run(global_step)
                log('=' * 50)
                log(' [*] Global step is reset to {}'.format(start_step))
                log('=' * 50)
            else:
                log('Starting new training run at commit: %s' % commit, slack=True)

            start_step = sess.run(global_step)

            train_feeder.start_in_session(sess, start_step)
            test_feeder.start_in_session(sess, start_step)

            while not coord.should_stop():
                start_time = time.time()
                step, loss, opt = sess.run(
                        [global_step, model.loss_without_coeff, model.optimize],
                        feed_dict=model.get_dummy_feed_dict())

                time_window.append(time.time() - start_time)
                loss_window.append(loss)

                message = 'Step %-7d [%.03f sec/step, loss=%.05f, avg_loss=%.05f]' % (
                        step, time_window.average, loss, loss_window.average)
                log(message, slack=(step % config.checkpoint_interval == 0))

                if loss > 100 or math.isnan(loss):
                    log('Loss exploded to %.05f at step %d!' % (loss, step), slack=True)
                    raise Exception('Loss Exploded')

                if step % config.summary_interval == 0:
                    log('Writing summary at step: %d' % step)

                    feed_dict = {
                            **model.get_dummy_feed_dict(),
                            **test_model.get_dummy_feed_dict()
                    }
                    summary_writer.add_summary(sess.run(
                            test_stats, feed_dict=feed_dict), step)

                if step % config.checkpoint_interval == 0:
                    log('Saving checkpoint to: %s-%d' % (checkpoint_path, step))
                    saver.save(sess, checkpoint_path, global_step=step)

                if step % config.test_interval == 0:
                    log('Saving audio and alignment...')
                    num_test = config.num_test

                    fetches = [
                            model.inputs[:num_test],
                            model.linear_outputs[:num_test],
                            model.alignments[:num_test],
                            test_model.inputs[:num_test],
                            test_model.linear_outputs[:num_test],
                            test_model.alignments[:num_test],
                    ]
                    feed_dict = {
                            **model.get_dummy_feed_dict(),
                            **test_model.get_dummy_feed_dict()
                    }

                    sequences, spectrograms, alignments, \
                            test_sequences, test_spectrograms, test_alignments = \
                            sess.run(fetches, feed_dict=feed_dict)

                    save_and_plot(sequences[:1], spectrograms[:1], alignments[:1],
                            log_dir, step, loss, "train")
                    save_and_plot(test_sequences, test_spectrograms, test_alignments,
                            log_dir, step, loss, "test")

        except Exception as e:
            log('Exiting due to exception: %s' % e, slack=True)
            traceback.print_exc()
            coord.request_stop(e)


def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('--log_dir', default='logs')
    parser.add_argument('--data_paths', default='datasets/kr_example')
    parser.add_argument('--load_path', default=None)
    parser.add_argument('--initialize_path', default=None)

    parser.add_argument('--num_test_per_speaker', type=int, default=2)
    parser.add_argument('--random_seed', type=int, default=123)
    parser.add_argument('--summary_interval', type=int, default=100)
    parser.add_argument('--test_interval', type=int, default=500)
    parser.add_argument('--checkpoint_interval', type=int, default=1000)
    parser.add_argument('--skip_path_filter',
            type=str2bool, default=False, help='Use only for debugging')

    parser.add_argument('--slack_url',
            help='Slack webhook URL to get periodic reports.')
    parser.add_argument('--git', action='store_true',
            help='If set, verify that the client is clean.')

    config = parser.parse_args()
    config.data_paths = config.data_paths.split(",")
    setattr(hparams, "num_speakers", len(config.data_paths))

    prepare_dirs(config, hparams)

    log_path = os.path.join(config.model_dir, 'train.log')
    infolog.init(log_path, config.model_dir, config.slack_url)

    tf.set_random_seed(config.random_seed)

    if any("krbook" not in data_path for data_path in config.data_paths) and \
            hparams.sample_rate != 20000:
        warning("Detected a non-krbook dataset. You may need to change the "
                "sample rate from {} to 20000".format(hparams.sample_rate))

    if config.load_path is not None and config.initialize_path is not None:
        raise Exception(" [!] Only one of load_path and initialize_path should be set")

    train(config.model_dir, config)


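# Example invocations (file and path names are illustrative):
#
#   python train.py --data_paths=datasets/kr_example
#
#   Multi-speaker training, one comma-separated data path per speaker
#   (hparams.model_type must be "deepvoice" or "simple"):
#     python train.py --data_paths=datasets/speaker1,datasets/speaker2
#
#   Resume from the latest checkpoint of a previous run:
#     python train.py --data_paths=datasets/kr_example --load_path=<model_dir of that run>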
if __name__ == '__main__':
    main()