import os
import time
import pprint
import random
import threading
import traceback
import numpy as np
from glob import glob
import tensorflow as tf
from collections import defaultdict

import text
from utils.infolog import log
from utils import parallel_run, remove_file
from audio import frames_to_hours
from audio.get_duration import get_durations

_pad = 0


def get_frame(path):
    data = np.load(path)
    n_frame = data["linear"].shape[0]
    n_token = len(data["tokens"])
    return (path, n_frame, n_token)


def get_path_dict(
        data_dirs, hparams, config, data_type,
        n_test=None, rng=np.random.RandomState(123)):

    # Load metadata:
    path_dict = {}
    for data_dir in data_dirs:
        paths = glob("{}/*.npz".format(data_dir))

        if data_type == 'train':
            rng.shuffle(paths)

        if not config.skip_path_filter:
            items = parallel_run(
                    get_frame, paths,
                    desc="filter_by_min_max_frame_batch", parallel=True)

            min_n_frame = hparams.reduction_factor * hparams.min_iters
            max_n_frame = hparams.reduction_factor * hparams.max_iters - hparams.reduction_factor

            # Keep examples whose frame count fits the decoder's iteration
            # bounds and whose text is long enough.
            new_items = [(path, n) for path, n, n_tokens in items
                    if min_n_frame <= n <= max_n_frame
                            and n_tokens >= hparams.min_tokens]

            if any(check in data_dir for check in ["son", "yuinna"]):
                blacklists = [".0000.", ".0001.", "NB11479580.0001"]
                # Drop an item when any blacklisted pattern appears in its path.
                new_items = [item for item in new_items
                        if not any(check in item[0] for check in blacklists)]

            new_paths = [path for path, n in new_items]
            new_n_frames = [n for path, n in new_items]

            hours = frames_to_hours(new_n_frames)

            log(' [{}] Loaded metadata for {} examples ({:.2f} hours)'.
                    format(data_dir, len(new_n_frames), hours))
            log(' [{}] Max length: {}'.format(data_dir, max(new_n_frames)))
            log(' [{}] Min length: {}'.format(data_dir, min(new_n_frames)))
        else:
            new_paths = paths

        if data_type == 'train':
            new_paths = new_paths[:-n_test]
        elif data_type == 'test':
            new_paths = new_paths[-n_test:]
        else:
            raise Exception(" [!] Unknown data_type: {}".format(data_type))

        path_dict[data_dir] = new_paths

    return path_dict
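
# A minimal sketch of the .npz layout this module expects, pieced together
# from the fields read in get_frame() and DataFeeder._get_next_example().
# The variable names on the right are illustrative, not part of this codebase:
#
#   np.savez(path,
#            tokens=token_ids,     # token id sequence produced via `text`
#            mel=mel_frames,       # float32, shape [n_frame, num_mels]
#            linear=linear_frames, # float32, shape [n_frame, num_freq]
#            loss_coeff=1.0)       # optional per-example loss weight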


class DataFeeder(threading.Thread):
    '''Feeds batches of data into a queue on a background thread.'''

    def __init__(self, coordinator, data_dirs,
            hparams, config, batches_per_group, data_type, batch_size):
        super(DataFeeder, self).__init__()

        self._coord = coordinator
        self._hp = hparams
        self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        self._step = 0
        self._offset = defaultdict(lambda: 2)
        self._batches_per_group = batches_per_group

        self.rng = np.random.RandomState(config.random_seed)
        self.data_type = data_type
        self.batch_size = batch_size

        self.min_tokens = hparams.min_tokens
        self.min_n_frame = hparams.reduction_factor * hparams.min_iters
        self.max_n_frame = hparams.reduction_factor * hparams.max_iters - hparams.reduction_factor
        self.skip_path_filter = config.skip_path_filter

        # Load metadata:
        self.path_dict = get_path_dict(
                data_dirs, self._hp, config, self.data_type,
                n_test=self.batch_size, rng=self.rng)

        self.data_dirs = list(self.path_dict.keys())
        self.data_dir_to_id = {
                data_dir: idx for idx, data_dir in enumerate(self.data_dirs)}

        data_weight = {
                data_dir: 1. for data_dir in self.data_dirs}

        # Boost the sampling weight of the main datasets, if configured.
        if self._hp.main_data_greedy_factor > 0 and \
                any(main_data in data_dir for data_dir in self.data_dirs
                        for main_data in self._hp.main_data):
            for main_data in self._hp.main_data:
                for data_dir in self.data_dirs:
                    if main_data in data_dir:
                        data_weight[data_dir] += self._hp.main_data_greedy_factor

        weight_Z = sum(data_weight.values())
        self.data_ratio = {
                data_dir: weight / weight_Z
                        for data_dir, weight in data_weight.items()}

        log("="*40)
        log(pprint.pformat(self.data_ratio, indent=4))
        log("="*40)

        #audio_paths = [path.replace("/data/", "/audio/"). \
        #        replace(".npz", ".wav") for path in self.data_paths]
        #duration = get_durations(audio_paths, print_detail=False)

        # Create placeholders for inputs and targets. Don't specify batch size
        # because we want to be able to feed different sized batches at eval time.
        self._placeholders = [
            tf.placeholder(tf.int32, [None, None], 'inputs'),
            tf.placeholder(tf.int32, [None], 'input_lengths'),
            tf.placeholder(tf.float32, [None], 'loss_coeff'),
            tf.placeholder(tf.float32, [None, None, hparams.num_mels], 'mel_targets'),
            tf.placeholder(tf.float32, [None, None, hparams.num_freq], 'linear_targets'),
        ]

        # Create queue for buffering data:
        dtypes = [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32]

        self.is_multi_speaker = len(self.data_dirs) > 1

        if self.is_multi_speaker:
            # Extra input carrying the per-example speaker id.
            self._placeholders.append(
                    tf.placeholder(tf.int32, [None], 'speaker_id'))
            dtypes.append(tf.int32)

        num_worker = 8 if self.data_type == 'train' else 1
        queue = tf.FIFOQueue(num_worker, dtypes, name='input_queue')
        self._enqueue_op = queue.enqueue(self._placeholders)

        if self.is_multi_speaker:
            self.inputs, self.input_lengths, self.loss_coeff, \
                    self.mel_targets, self.linear_targets, self.speaker_id = queue.dequeue()
        else:
            self.inputs, self.input_lengths, self.loss_coeff, \
                    self.mel_targets, self.linear_targets = queue.dequeue()

        self.inputs.set_shape(self._placeholders[0].shape)
        self.input_lengths.set_shape(self._placeholders[1].shape)
        self.loss_coeff.set_shape(self._placeholders[2].shape)
        self.mel_targets.set_shape(self._placeholders[3].shape)
        self.linear_targets.set_shape(self._placeholders[4].shape)

        if self.is_multi_speaker:
            self.speaker_id.set_shape(self._placeholders[5].shape)
        else:
            self.speaker_id = None

        if self.data_type == 'test':
            # Build a fixed set of batches once so evaluation is deterministic.
            examples = []
            while True:
                for data_dir in self.data_dirs:
                    examples.append(self._get_next_example(data_dir))
                    #print(data_dir, text.sequence_to_text(examples[-1][0], False, True))
                    if len(examples) >= self.batch_size:
                        break
                if len(examples) >= self.batch_size:
                    break
            self.static_batches = [examples for _ in range(self._batches_per_group)]
        else:
            self.static_batches = None

    def start_in_session(self, session, start_step):
        self._step = start_step
        self._session = session
        self.start()

    def run(self):
        try:
            while not self._coord.should_stop():
                self._enqueue_next_group()
        except Exception as e:
            traceback.print_exc()
            self._coord.request_stop(e)
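
    # A worked example of the group sizing in _enqueue_next_group below
    # (numbers are illustrative, not from the codebase): with batch_size
    # n=32, batches_per_group=8, and data_ratio {'a': 0.75, 'b': 0.25},
    # a group past the initial phase draws int(32 * 8 * 0.75) == 192
    # examples from 'a' and 64 from 'b', sorts all 256 by frame count,
    # slices them into 8 batches of 32, and shuffles the batch order.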
    def _enqueue_next_group(self):
        start = time.time()

        # Read a group of examples:
        n = self.batch_size
        r = self._hp.reduction_factor

        if self.static_batches is not None:
            batches = self.static_batches
        else:
            examples = []
            for data_dir in self.data_dirs:
                if self._hp.initial_data_greedy:
                    # During the initial phase, draw everything from a
                    # "krbook" directory when one is available.
                    if self._step < self._hp.initial_phase_step and \
                            any("krbook" in data_dir for data_dir in self.data_dirs):
                        data_dir = [data_dir for data_dir in self.data_dirs
                                if "krbook" in data_dir][0]

                if self._step < self._hp.initial_phase_step:
                    example = [self._get_next_example(data_dir)
                            for _ in range(int(
                                n * self._batches_per_group // len(self.data_dirs)))]
                else:
                    example = [self._get_next_example(data_dir)
                            for _ in range(int(
                                n * self._batches_per_group * self.data_ratio[data_dir]))]
                examples.extend(example)

            # Sort by frame count so each batch holds similar-length examples.
            examples.sort(key=lambda x: x[-1])
            batches = [examples[i:i+n] for i in range(0, len(examples), n)]
            self.rng.shuffle(batches)

        log('Generated %d batches of size %d in %.03f sec' %
                (len(batches), n, time.time() - start))

        for batch in batches:
            feed_dict = dict(zip(self._placeholders,
                    _prepare_batch(batch, r, self.rng, self.data_type)))
            self._session.run(self._enqueue_op, feed_dict=feed_dict)
            self._step += 1

    def _get_next_example(self, data_dir):
        '''Loads a single example (input_data, loss_coeff, mel_target,
        linear_target, speaker_id, n_frames) from disk.'''
        data_paths = self.path_dict[data_dir]

        while True:
            if self._offset[data_dir] >= len(data_paths):
                self._offset[data_dir] = 0

                if self.data_type == 'train':
                    self.rng.shuffle(data_paths)

            data_path = data_paths[self._offset[data_dir]]
            self._offset[data_dir] += 1

            try:
                if os.path.exists(data_path):
                    data = np.load(data_path)
                else:
                    continue
            except Exception:
                # An unreadable .npz is assumed corrupt; drop it from disk.
                remove_file(data_path)
                continue

            if not self.skip_path_filter:
                break

            # Paths were not pre-filtered, so apply the length filter here.
            if self.min_n_frame <= data["linear"].shape[0] <= self.max_n_frame and \
                    len(data["tokens"]) > self.min_tokens:
                break

        input_data = data['tokens']
        mel_target = data['mel']

        if 'loss_coeff' in data:
            loss_coeff = data['loss_coeff']
        else:
            loss_coeff = 1
        linear_target = data['linear']

        return (input_data, loss_coeff, mel_target, linear_target,
                self.data_dir_to_id[data_dir], len(linear_target))


def _prepare_batch(batch, reduction_factor, rng, data_type=None):
    if data_type == 'train':
        rng.shuffle(batch)

    inputs = _prepare_inputs([x[0] for x in batch])
    input_lengths = np.asarray([len(x[0]) for x in batch], dtype=np.int32)
    loss_coeff = np.asarray([x[1] for x in batch], dtype=np.float32)

    mel_targets = _prepare_targets([x[2] for x in batch], reduction_factor)
    linear_targets = _prepare_targets([x[3] for x in batch], reduction_factor)

    if len(batch[0]) == 6:
        speaker_id = np.asarray([x[4] for x in batch], dtype=np.int32)
        return (inputs, input_lengths, loss_coeff,
                mel_targets, linear_targets, speaker_id)
    else:
        return (inputs, input_lengths, loss_coeff, mel_targets, linear_targets)


def _prepare_inputs(inputs):
    max_len = max((len(x) for x in inputs))
    return np.stack([_pad_input(x, max_len) for x in inputs])


def _prepare_targets(targets, alignment):
    # Pad one frame past the longest target, rounded up to a multiple of
    # the reduction factor (`alignment`).
    max_len = max((len(t) for t in targets)) + 1
    return np.stack([_pad_target(t, _round_up(max_len, alignment)) for t in targets])


def _pad_input(x, length):
    return np.pad(x, (0, length - x.shape[0]),
            mode='constant', constant_values=_pad)


def _pad_target(t, length):
    return np.pad(t, [(0, length - t.shape[0]), (0, 0)],
            mode='constant', constant_values=_pad)


def _round_up(x, multiple):
    remainder = x % multiple
    return x if remainder == 0 else x + multiple - remainder
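
# Worked example of the padding helpers above (illustrative numbers): with
# reduction_factor=5 and a longest target of 103 frames, _prepare_targets
# pads every target in the batch to _round_up(103 + 1, 5) == 105 frames,
# so each padded length is a whole multiple of the reduction factor.
#
# A minimal usage sketch, assuming a trainer that supplies `hparams` and
# `config` objects with the attributes referenced in DataFeeder.__init__:
#
#   coord = tf.train.Coordinator()
#   feeder = DataFeeder(coord, ['datasets/son/data'], hparams, config,
#                       batches_per_group=4, data_type='train', batch_size=32)
#   with tf.Session() as sess:
#       feeder.start_in_session(sess, start_step=0)
#       ...  # training loop consuming feeder.inputs, feeder.mel_targets, etc.
#       coord.request_stop()
#       coord.join([feeder])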