# Code based on https://github.com/keithito/tacotron/blob/master/datasets/ljspeech.py
import os
import json
import argparse
import numpy as np
from tqdm import tqdm
from functools import partial
from collections import Counter, defaultdict
from concurrent.futures import ProcessPoolExecutor

import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt

from hparams import hparams
from text import text_to_sequence
from utils import makedirs, remove_file, warning
from audio import load_audio, spectrogram, melspectrogram, frames_to_hours


def one(x=None):
    # Module-level default factory for defaultdict: a lambda would not be
    # picklable for use with ProcessPoolExecutor workers.
    return 1


def build_from_path(config):
    warning("Sampling rate: {}".format(hparams.sample_rate))

    executor = ProcessPoolExecutor(max_workers=config.num_workers)
    futures = []

    base_dir = os.path.dirname(config.metadata_path)
    data_dir = os.path.join(base_dir, config.data_dirname)
    makedirs(data_dir)

    # Per-utterance loss weight; defaults to 1 for trusted transcripts.
    loss_coeff = defaultdict(one)

    if config.metadata_path.endswith("json"):
        with open(config.metadata_path) as f:
            info = json.load(f)
    elif config.metadata_path.endswith("csv"):
        info = {}
        with open(config.metadata_path) as f:
            for line in f:
                path, text = line.strip().split('|')
                info[path] = text
    else:
        raise Exception(" [!] Unknown metadata format: {}".format(config.metadata_path))

    # Resolve audio paths relative to the metadata file and drop missing files.
    new_info = {}
    for path in info.keys():
        if not os.path.exists(path):
            new_path = os.path.join(base_dir, path)
            if not os.path.exists(new_path):
                print(" [!] Audio not found: {}".format([path, new_path]))
                continue
        else:
            new_path = path

        new_info[new_path] = info[path]

    info = new_info

    # A list value holds multiple transcription candidates; keep the first
    # entry and down-weight the utterance depending on the ignore level.
    for path in info.keys():
        if isinstance(info[path], list):
            if (hparams.ignore_recognition_level == 1 and len(info[path]) == 1) or \
                    hparams.ignore_recognition_level == 2:
                loss_coeff[path] = hparams.recognition_loss_coeff

            info[path] = info[path][0]

    ignore_description = {
        0: "use all",
        1: "ignore only unmatched_alignment",
        2: "fully ignore recognition",
    }

    print(" [!] Skip recognition level: {} ({})".format(
        hparams.ignore_recognition_level,
        ignore_description[hparams.ignore_recognition_level]))
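    # Illustrative metadata formats (hypothetical paths and transcripts):
    #   CSV : one "path|text" pair per line, e.g. "wavs/0001.wav|Hello."
    #   JSON: {"wavs/0001.wav": "Hello."}, or with recognition candidates
    #         {"wavs/0001.wav": ["Hello.", "hallo"]}; list values were
    #         reduced to their first entry above.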
    for audio_path, text in info.items():
        # Skip down-weighted (unreliable) utterances entirely when any
        # recognition-ignore level is active.
        if hparams.ignore_recognition_level > 0 and loss_coeff[audio_path] != 1:
            continue

        if base_dir not in audio_path:
            audio_path = os.path.join(base_dir, audio_path)

        try:
            tokens = text_to_sequence(text)
        except Exception:
            continue

        fn = partial(
            _process_utterance,
            audio_path, data_dir, tokens, loss_coeff[audio_path])
        futures.append(executor.submit(fn))

    n_frames = [future.result() for future in tqdm(futures)]
    n_frames = [n_frame for n_frame in n_frames if n_frame is not None]

    hours = frames_to_hours(n_frames)

    print(' [*] Loaded metadata for {} examples ({:.2f} hours)'.format(len(n_frames), hours))
    print(' [*] Max length: {}'.format(max(n_frames)))
    print(' [*] Min length: {}'.format(min(n_frames)))

    plot_n_frames(n_frames, os.path.join(
        base_dir, "n_frames_before_filter.png"))

    # Keep only utterances whose frame count fits the decoder's
    # reduction_factor * iteration limits.
    min_n_frame = hparams.reduction_factor * hparams.min_iters
    max_n_frame = hparams.reduction_factor * hparams.max_iters - hparams.reduction_factor

    n_frames = [n for n in n_frames if min_n_frame <= n <= max_n_frame]
    hours = frames_to_hours(n_frames)

    print(' [*] After filtering: {} examples ({:.2f} hours)'.format(len(n_frames), hours))
    print(' [*] Max length: {}'.format(max(n_frames)))
    print(' [*] Min length: {}'.format(min(n_frames)))

    plot_n_frames(n_frames, os.path.join(
        base_dir, "n_frames_after_filter.png"))


def plot_n_frames(n_frames, path):
    # Bar chart of how many utterances have each frame count.
    labels, values = list(zip(*Counter(n_frames).most_common()))

    values = [v for _, v in sorted(zip(labels, values))]
    labels = sorted(labels)

    indexes = np.arange(len(labels))
    width = 1

    fig, ax = plt.subplots(figsize=(len(labels) / 2, 5))

    # plt.bar centers bars on the given x positions, so ticks go at indexes.
    plt.bar(indexes, values, width)
    plt.xticks(indexes, labels)
    plt.tight_layout()
    plt.savefig(path)
    plt.close(fig)


def _process_utterance(audio_path, data_dir, tokens, loss_coeff):
    audio_name = os.path.basename(audio_path)

    filename = audio_name.rsplit('.', 1)[0] + ".npz"
    numpy_path = os.path.join(data_dir, filename)

    if not os.path.exists(numpy_path):
        wav = load_audio(audio_path)

        linear_spectrogram = spectrogram(wav).astype(np.float32)
        mel_spectrogram = melspectrogram(wav).astype(np.float32)

        data = {
            "linear": linear_spectrogram.T,
            "mel": mel_spectrogram.T,
            "tokens": tokens,
            "loss_coeff": loss_coeff,
        }

        n_frame = linear_spectrogram.shape[1]

        if hparams.skip_inadequate:
            min_n_frame = hparams.reduction_factor * hparams.min_iters
            max_n_frame = hparams.reduction_factor * hparams.max_iters \
                    - hparams.reduction_factor

            # Skip utterances that are too short, too long, or too sparse in tokens.
            if not (min_n_frame <= n_frame <= max_n_frame
                    and len(tokens) >= hparams.min_tokens):
                return None

        # np.savez stores every keyword argument as an array in the archive,
        # so only the data dict is passed here.
        np.savez(numpy_path, **data)
    else:
        try:
            data = np.load(numpy_path)
            n_frame = data["linear"].shape[0]
        except Exception:
            # Corrupt cache file: delete it and regenerate.
            remove_file(numpy_path)
            return _process_utterance(audio_path, data_dir, tokens, loss_coeff)

    return n_frame


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='spectrogram')

    parser.add_argument('metadata_path', type=str)
    parser.add_argument('--data_dirname', type=str, default="data")
    parser.add_argument('--num_workers', type=int, default=None)

    config = parser.parse_args()
    build_from_path(config)
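# Example usage (assuming this script is saved as datasets/generate_data.py,
# following the referenced repository layout; the paths below are hypothetical):
#
#   python datasets/generate_data.py datasets/son/alignment.json
#   python datasets/generate_data.py datasets/LJSpeech/metadata.csv --num_workers 8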