# multi-speaker-tacotron-tens.../models/modules.py
# Code based on https://github.com/keithito/tacotron/blob/master/models/tacotron.py
import tensorflow as tf
from tensorflow.contrib.rnn import GRUCell
from tensorflow.python.layers import core
from tensorflow.contrib.seq2seq.python.ops.attention_wrapper import (
    _bahdanau_score, _BaseAttentionMechanism, BahdanauAttention,
    AttentionWrapper, AttentionWrapperState)


def get_embed(inputs, num_inputs, embed_size, name):
    """Look up embeddings for integer `inputs` in a [num_inputs, embed_size] table."""
    embed_table = tf.get_variable(
        name, [num_inputs, embed_size], dtype=tf.float32,
        initializer=tf.truncated_normal_initializer(stddev=0.1))
    return tf.nn.embedding_lookup(embed_table, inputs)


def prenet(inputs, is_training, layer_sizes, drop_prob, scope=None):
    """Stack of dense + ReLU + dropout layers; dropout is disabled at inference."""
    x = inputs
    drop_rate = drop_prob if is_training else 0.0
    with tf.variable_scope(scope or 'prenet'):
        for i, size in enumerate(layer_sizes):
            dense = tf.layers.dense(x, units=size, activation=tf.nn.relu, name='dense_%d' % (i+1))
            x = tf.layers.dropout(dense, rate=drop_rate, name='dropout_%d' % (i+1))
    return x


def cbhg(inputs, input_lengths, is_training,
         bank_size, bank_channel_size,
         maxpool_width, highway_depth, rnn_size,
         proj_sizes, proj_width, scope,
         before_highway=None, encoder_rnn_init_state=None):
    """CBHG module: conv bank + max-pooling + conv projections + highway net + bidirectional GRU."""
    batch_size = tf.shape(inputs)[0]

    with tf.variable_scope(scope):
        with tf.variable_scope('conv_bank'):
            # Convolution bank: concatenate on the last axis
            # to stack channels from all convolutions.
            conv_fn = lambda k: \
                conv1d(inputs, k, bank_channel_size,
                       tf.nn.relu, is_training, 'conv1d_%d' % k)

            conv_outputs = tf.concat(
                [conv_fn(k) for k in range(1, bank_size+1)], axis=-1,
            )

        # Maxpooling:
        maxpool_output = tf.layers.max_pooling1d(
            conv_outputs,
            pool_size=maxpool_width,
            strides=1,
            padding='same')

        # Two projection layers:
        proj_out = maxpool_output
        for idx, proj_size in enumerate(proj_sizes):
            activation_fn = None if idx == len(proj_sizes) - 1 else tf.nn.relu
            proj_out = conv1d(
                proj_out, proj_width, proj_size, activation_fn,
                is_training, 'proj_{}'.format(idx + 1))

        # Residual connection:
        if before_highway is not None:
            expanded_before_highway = tf.expand_dims(before_highway, [1])
            tiled_before_highway = tf.tile(
                expanded_before_highway, [1, tf.shape(proj_out)[1], 1])

            highway_input = proj_out + inputs + tiled_before_highway
        else:
            highway_input = proj_out + inputs

        # Handle dimensionality mismatch:
        if highway_input.shape[2] != rnn_size:
            highway_input = tf.layers.dense(highway_input, rnn_size)

        # HighwayNet (highway_depth layers):
        for idx in range(highway_depth):
            highway_input = highwaynet(highway_input, 'highway_%d' % (idx+1))
        rnn_input = highway_input

        # Bidirectional RNN:
        if encoder_rnn_init_state is not None:
            initial_state_fw, initial_state_bw = \
                tf.split(encoder_rnn_init_state, 2, 1)
        else:
            initial_state_fw, initial_state_bw = None, None

        cell_fw, cell_bw = GRUCell(rnn_size), GRUCell(rnn_size)
        outputs, states = tf.nn.bidirectional_dynamic_rnn(
            cell_fw, cell_bw,
            rnn_input,
            sequence_length=input_lengths,
            initial_state_fw=initial_state_fw,
            initial_state_bw=initial_state_bw,
            dtype=tf.float32)

        return tf.concat(outputs, axis=2)  # Concat forward and backward outputs


def batch_tile(tensor, batch_size):
    """Tile a batch-independent tensor so it is repeated once per batch element."""
    expanded_tensor = tf.expand_dims(tensor, [0])
    return tf.tile(expanded_tensor,
                   [batch_size] + [1 for _ in tensor.get_shape()])


def highwaynet(inputs, scope):
    highway_dim = int(inputs.get_shape()[-1])

    with tf.variable_scope(scope):
        H = tf.layers.dense(
            inputs,
            units=highway_dim,
            activation=tf.nn.relu,
            name='H')
        T = tf.layers.dense(
            inputs,
            units=highway_dim,
            activation=tf.nn.sigmoid,
            name='T',
            bias_initializer=tf.constant_initializer(-1.0))
        return H * T + inputs * (1.0 - T)


def conv1d(inputs, kernel_size, channels, activation, is_training, scope):
    with tf.variable_scope(scope):
        conv1d_output = tf.layers.conv1d(
            inputs,
            filters=channels,
            kernel_size=kernel_size,
            activation=activation,
            padding='same')
        return tf.layers.batch_normalization(conv1d_output, training=is_training)
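

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module): one plausible way to
# wire the pieces above into a Tacotron-style encoder. The vocabulary size and
# all hyperparameter values below are illustrative assumptions, not this
# project's actual configuration.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    char_ids = tf.placeholder(tf.int32, [None, None], name='char_ids')      # [batch, time]
    char_lengths = tf.placeholder(tf.int32, [None], name='char_lengths')    # [batch]

    # Character embedding -> prenet -> encoder CBHG.
    char_embedded = get_embed(char_ids, num_inputs=70, embed_size=256,
                              name='char_embedding')
    prenet_out = prenet(char_embedded, is_training=True,
                        layer_sizes=[256, 128], drop_prob=0.5)
    encoder_out = cbhg(prenet_out, char_lengths, is_training=True,
                       bank_size=16, bank_channel_size=128,
                       maxpool_width=2, highway_depth=4, rnn_size=128,
                       proj_sizes=[128, 128], proj_width=3,
                       scope='encoder_cbhg')

    print(encoder_out)  # Tensor of shape [batch, time, 2 * rnn_size]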