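'''Feeds training data to a Tacotron-style model.

Scans per-utterance .npz records under one or more data directories, filters
them by frame and token length, and streams padded batches into a TensorFlow
FIFOQueue from a background thread.'''
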
import os
import time
import pprint
import random
import threading
import traceback
import numpy as np
from glob import glob
import tensorflow as tf
from collections import defaultdict

import text
from utils.infolog import log
from utils import parallel_run, remove_file
from audio import frames_to_hours
from audio.get_duration import get_durations


_pad = 0


def get_frame(path):
    data = np.load(path)
    n_frame = data["linear"].shape[0]
    n_token = len(data["tokens"])
    return (path, n_frame, n_token)

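# For reference, a sketch of the assumed .npz record schema, inferred from the
# keys this module reads ("linear", "mel", "tokens", optional "loss_coeff");
# the file name and shapes below are illustrative only:
#
#   np.savez("p225_001.npz",
#            linear=np.zeros((200, 1025), dtype=np.float32),  # [n_frame, num_freq]
#            mel=np.zeros((200, 80), dtype=np.float32),       # [n_frame, num_mels]
#            tokens=np.arange(30, dtype=np.int32),            # text token ids
#            loss_coeff=1.0)                                  # optional weight

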
def get_path_dict(
        data_dirs, hparams, config,
        data_type, n_test=None,
        rng=np.random.RandomState(123)):

    # Load metadata:
    path_dict = {}
    for data_dir in data_dirs:
        paths = glob("{}/*.npz".format(data_dir))

        if data_type == 'train':
            rng.shuffle(paths)

        if not config.skip_path_filter:
            items = parallel_run(
                    get_frame, paths,
                    desc="filter_by_min_max_frame_batch", parallel=True)

            min_n_frame = hparams.reduction_factor * hparams.min_iters
            max_n_frame = hparams.reduction_factor * hparams.max_iters \
                    - hparams.reduction_factor

            # Keep only examples whose frame count fits the decoder's
            # iteration range and which have enough text tokens.
            new_items = [(path, n) for path, n, n_tokens in items
                    if min_n_frame <= n <= max_n_frame
                    and n_tokens >= hparams.min_tokens]

            if any(check in data_dir for check in ["son", "yuinna"]):
                # Drop every example whose path contains a blacklisted id.
                blacklists = [".0000.", ".0001.", "NB11479580.0001"]
                new_items = [item for item in new_items
                        if all(check not in item[0] for check in blacklists)]

            new_paths = [path for path, n in new_items]
            new_n_frames = [n for path, n in new_items]

            hours = frames_to_hours(new_n_frames)

            log(' [{}] Loaded metadata for {} examples ({:.2f} hours)'.format(
                    data_dir, len(new_n_frames), hours))
            log(' [{}] Max length: {}'.format(data_dir, max(new_n_frames)))
            log(' [{}] Min length: {}'.format(data_dir, min(new_n_frames)))
        else:
            new_paths = paths

        # Hold out the last n_test paths as the test split.
        if data_type == 'train':
            new_paths = new_paths[:-n_test]
        elif data_type == 'test':
            new_paths = new_paths[-n_test:]
        else:
            raise Exception(" [!] Unknown data_type: {}".format(data_type))

        path_dict[data_dir] = new_paths

    return path_dict

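# A minimal usage sketch. `hparams` and `config` are the project's usual
# hyperparameter and argparse-style objects; the directory path below is
# illustrative only:
#
#   path_dict = get_path_dict(
#       ["datasets/son/data"], hparams, config,
#       data_type='train', n_test=32)
#   train_paths = path_dict["datasets/son/data"]

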
class DataFeeder(threading.Thread):
    '''Feeds batches of data into a queue on a background thread.'''

    def __init__(self, coordinator, data_dirs,
            hparams, config, batches_per_group, data_type, batch_size):
        super(DataFeeder, self).__init__()

        self._coord = coordinator
        self._hp = hparams
        self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        self._step = 0
        self._offset = defaultdict(lambda: 2)
        self._batches_per_group = batches_per_group

        self.rng = np.random.RandomState(config.random_seed)
        self.data_type = data_type
        self.batch_size = batch_size

        self.min_tokens = hparams.min_tokens
        self.min_n_frame = hparams.reduction_factor * hparams.min_iters
        self.max_n_frame = hparams.reduction_factor * hparams.max_iters \
                - hparams.reduction_factor
        self.skip_path_filter = config.skip_path_filter

        # Load metadata:
        self.path_dict = get_path_dict(
                data_dirs, self._hp, config, self.data_type,
                n_test=self.batch_size, rng=self.rng)

        self.data_dirs = list(self.path_dict.keys())
        self.data_dir_to_id = {
                data_dir: idx for idx, data_dir in enumerate(self.data_dirs)}

        data_weight = {
                data_dir: 1. for data_dir in self.data_dirs
        }

        # Upweight directories that match `main_data` so they are sampled
        # more often, then normalize the weights into sampling ratios.
        if self._hp.main_data_greedy_factor > 0 and \
                any(main_data in data_dir for data_dir in self.data_dirs
                        for main_data in self._hp.main_data):
            for main_data in self._hp.main_data:
                for data_dir in self.data_dirs:
                    if main_data in data_dir:
                        data_weight[data_dir] += self._hp.main_data_greedy_factor

        weight_Z = sum(data_weight.values())
        self.data_ratio = {
                data_dir: weight / weight_Z
                for data_dir, weight in data_weight.items()
        }

        log("=" * 40)
        log(pprint.pformat(self.data_ratio, indent=4))
        log("=" * 40)

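        # Hypothetical example: with two directories where only the first
        # matches main_data and main_data_greedy_factor == 0.5, the weights
        # become {A: 1.5, B: 1.0}, giving data_ratio == {A: 0.6, B: 0.4}.
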
        #audio_paths = [path.replace("/data/", "/audio/"). \
        #        replace(".npz", ".wav") for path in self.data_paths]
        #duration = get_durations(audio_paths, print_detail=False)

        # Create placeholders for inputs and targets. Don't specify batch
        # size because we want to be able to feed different sized batches
        # at eval time.
        self._placeholders = [
                tf.placeholder(tf.int32, [None, None], 'inputs'),
                tf.placeholder(tf.int32, [None], 'input_lengths'),
                tf.placeholder(tf.float32, [None], 'loss_coeff'),
                tf.placeholder(tf.float32,
                        [None, None, hparams.num_mels], 'mel_targets'),
                tf.placeholder(tf.float32,
                        [None, None, hparams.num_freq], 'linear_targets'),
        ]

        # Create queue for buffering data:
        dtypes = [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32]

        self.is_multi_speaker = len(self.data_dirs) > 1

        if self.is_multi_speaker:
            # One speaker id per example.
            self._placeholders.append(
                    tf.placeholder(tf.int32, [None], 'speaker_id'))
            dtypes.append(tf.int32)

        num_worker = 8 if self.data_type == 'train' else 1
        queue = tf.FIFOQueue(num_worker, dtypes, name='input_queue')

        self._enqueue_op = queue.enqueue(self._placeholders)

        if self.is_multi_speaker:
            self.inputs, self.input_lengths, self.loss_coeff, \
                    self.mel_targets, self.linear_targets, \
                    self.speaker_id = queue.dequeue()
        else:
            self.inputs, self.input_lengths, self.loss_coeff, \
                    self.mel_targets, self.linear_targets = queue.dequeue()

        self.inputs.set_shape(self._placeholders[0].shape)
        self.input_lengths.set_shape(self._placeholders[1].shape)
        self.loss_coeff.set_shape(self._placeholders[2].shape)
        self.mel_targets.set_shape(self._placeholders[3].shape)
        self.linear_targets.set_shape(self._placeholders[4].shape)

        if self.is_multi_speaker:
            self.speaker_id.set_shape(self._placeholders[5].shape)
        else:
            self.speaker_id = None

        if self.data_type == 'test':
            # Build one fixed set of examples that cycles over every
            # data_dir and reuse it for the whole group, so evaluation
            # always sees the same batch.
            examples = []
            while True:
                for data_dir in self.data_dirs:
                    examples.append(self._get_next_example(data_dir))
                    #print(data_dir, text.sequence_to_text(examples[-1][0], False, True))
                    if len(examples) >= self.batch_size:
                        break
                if len(examples) >= self.batch_size:
                    break
            self.static_batches = [examples for _ in range(self._batches_per_group)]

        else:
            self.static_batches = None

    def start_in_session(self, session, start_step):
        self._step = start_step
        self._session = session
        self.start()

    def run(self):
        try:
            while not self._coord.should_stop():
                self._enqueue_next_group()
        except Exception as e:
            traceback.print_exc()
            self._coord.request_stop(e)

    def _enqueue_next_group(self):
        start = time.time()

        # Read a group of examples:
        n = self.batch_size
        r = self._hp.reduction_factor

        if self.static_batches is not None:
            batches = self.static_batches
        else:
            examples = []
            for data_dir in self.data_dirs:
                # During the initial phase, optionally draw everything from
                # the "krbook" directory before switching to ratio sampling.
                if self._hp.initial_data_greedy:
                    if self._step < self._hp.initial_phase_step and \
                            any("krbook" in data_dir for data_dir in self.data_dirs):
                        data_dir = [data_dir for data_dir in self.data_dirs
                                if "krbook" in data_dir][0]

                if self._step < self._hp.initial_phase_step:
                    example = [self._get_next_example(data_dir)
                            for _ in range(int(
                                    n * self._batches_per_group // len(self.data_dirs)))]
                else:
                    example = [self._get_next_example(data_dir)
                            for _ in range(int(
                                    n * self._batches_per_group * self.data_ratio[data_dir]))]
                examples.extend(example)

            # Sort by output length so each batch holds similar-length targets.
            examples.sort(key=lambda x: x[-1])

            batches = [examples[i:i + n] for i in range(0, len(examples), n)]
            self.rng.shuffle(batches)

        log('Generated %d batches of size %d in %.03f sec' % (
                len(batches), n, time.time() - start))
        for batch in batches:
            # zip() silently drops the trailing speaker_id in single-speaker
            # mode, where there are only five placeholders.
            feed_dict = dict(zip(
                    self._placeholders,
                    _prepare_batch(batch, r, self.rng, self.data_type)))
            self._session.run(self._enqueue_op, feed_dict=feed_dict)
            self._step += 1

    def _get_next_example(self, data_dir):
        '''Loads a single example
        (tokens, loss_coeff, mel_target, linear_target, speaker_id, n_frame)
        from disk.'''
        data_paths = self.path_dict[data_dir]

        while True:
            if self._offset[data_dir] >= len(data_paths):
                self._offset[data_dir] = 0

                if self.data_type == 'train':
                    self.rng.shuffle(data_paths)

            data_path = data_paths[self._offset[data_dir]]
            self._offset[data_dir] += 1

            try:
                if os.path.exists(data_path):
                    data = np.load(data_path)
                else:
                    continue
            except Exception:
                # Corrupt .npz: delete it and move on to the next path.
                remove_file(data_path)
                continue

            if not self.skip_path_filter:
                break

            if self.min_n_frame <= data["linear"].shape[0] <= self.max_n_frame and \
                    len(data["tokens"]) >= self.min_tokens:
                break

        input_data = data['tokens']
        mel_target = data['mel']

        if 'loss_coeff' in data:
            loss_coeff = data['loss_coeff']
        else:
            loss_coeff = 1
        linear_target = data['linear']

        return (input_data, loss_coeff, mel_target, linear_target,
                self.data_dir_to_id[data_dir], len(linear_target))


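# Typical driver-side wiring (a sketch; the tf.train.Coordinator / tf.Session
# pattern is standard TF 1.x, but the argument values here are illustrative):
#
#   coord = tf.train.Coordinator()
#   feeder = DataFeeder(coord, data_dirs, hparams, config,
#                       batches_per_group=32, data_type='train', batch_size=32)
#   with tf.Session() as sess:
#       feeder.start_in_session(sess, start_step=0)
#       # ...the training loop consumes feeder.inputs, feeder.mel_targets, etc.

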
def _prepare_batch(batch, reduction_factor, rng, data_type=None):
    if data_type == 'train':
        rng.shuffle(batch)

    inputs = _prepare_inputs([x[0] for x in batch])
    input_lengths = np.asarray([len(x[0]) for x in batch], dtype=np.int32)
    loss_coeff = np.asarray([x[1] for x in batch], dtype=np.float32)

    mel_targets = _prepare_targets([x[2] for x in batch], reduction_factor)
    linear_targets = _prepare_targets([x[3] for x in batch], reduction_factor)

    if len(batch[0]) == 6:
        speaker_id = np.asarray([x[4] for x in batch], dtype=np.int32)
        return (inputs, input_lengths, loss_coeff,
                mel_targets, linear_targets, speaker_id)
    else:
        return (inputs, input_lengths, loss_coeff, mel_targets, linear_targets)


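# The padded arrays produced above have these shapes (a sketch, with B = batch
# size, T_in = longest token sequence, and T_out = padded target length, a
# multiple of reduction_factor):
#
#   inputs          [B, T_in]               int32
#   input_lengths   [B]                     int32
#   loss_coeff      [B]                     float32
#   mel_targets     [B, T_out, num_mels]    float32
#   linear_targets  [B, T_out, num_freq]    float32

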
def _prepare_inputs(inputs):
    max_len = max(len(x) for x in inputs)
    return np.stack([_pad_input(x, max_len) for x in inputs])


def _prepare_targets(targets, alignment):
    # +1 guarantees at least one all-zero frame after even the longest target.
    max_len = max(len(t) for t in targets) + 1
    return np.stack([_pad_target(t, _round_up(max_len, alignment)) for t in targets])


def _pad_input(x, length):
    return np.pad(x, (0, length - x.shape[0]),
            mode='constant', constant_values=_pad)


def _pad_target(t, length):
    return np.pad(t, [(0, length - t.shape[0]), (0, 0)],
            mode='constant', constant_values=_pad)


def _round_up(x, multiple):
    remainder = x % multiple
    return x if remainder == 0 else x + multiple - remainder
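

# A quick worked example (illustrative values): with reduction_factor == 5 and
# a longest target of 12 frames, _prepare_targets pads every target up to
# _round_up(12 + 1, 5) == 15 frames, so the decoder always emits whole
# reduction groups.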