# Code based on https://github.com/keithito/tacotron/blob/master/models/tacotron.py
import tensorflow as tf
from tensorflow.contrib.rnn import GRUCell
from tensorflow.python.layers import core
from tensorflow.contrib.seq2seq.python.ops.attention_wrapper \
    import _bahdanau_score, _BaseAttentionMechanism, BahdanauAttention, \
    AttentionWrapper, AttentionWrapperState


def get_embed(inputs, num_inputs, embed_size, name):
    """Looks up `embed_size`-dim embeddings for the integer ids in `inputs`."""
    embed_table = tf.get_variable(
        name, [num_inputs, embed_size], dtype=tf.float32,
        initializer=tf.truncated_normal_initializer(stddev=0.1))
    return tf.nn.embedding_lookup(embed_table, inputs)


def prenet(inputs, is_training, layer_sizes, drop_prob, scope=None):
    """Stack of fully-connected ReLU layers with dropout (dropout only while training)."""
    x = inputs
    drop_rate = drop_prob if is_training else 0.0
    with tf.variable_scope(scope or 'prenet'):
        for i, size in enumerate(layer_sizes):
            dense = tf.layers.dense(
                x, units=size, activation=tf.nn.relu, name='dense_%d' % (i + 1))
            x = tf.layers.dropout(dense, rate=drop_rate, name='dropout_%d' % (i + 1))
    return x


def cbhg(inputs, input_lengths, is_training,
         bank_size, bank_channel_size,
         maxpool_width, highway_depth, rnn_size,
         proj_sizes, proj_width, scope,
         before_highway=None, encoder_rnn_init_state=None):
    """CBHG module: conv bank -> max-pool -> conv projections -> highway net -> bidirectional GRU."""
    batch_size = tf.shape(inputs)[0]

    with tf.variable_scope(scope):
        with tf.variable_scope('conv_bank'):
            # Convolution bank: concatenate on the last axis
            # to stack channels from all convolutions.
            conv_fn = lambda k: conv1d(
                inputs, k, bank_channel_size, tf.nn.relu,
                is_training, 'conv1d_%d' % k)

            conv_outputs = tf.concat(
                [conv_fn(k) for k in range(1, bank_size + 1)], axis=-1)

        # Max-pooling along time (stride 1 preserves the sequence length):
        maxpool_output = tf.layers.max_pooling1d(
            conv_outputs, pool_size=maxpool_width, strides=1, padding='same')

        # Projection layers (the last one is linear):
        proj_out = maxpool_output
        for idx, proj_size in enumerate(proj_sizes):
            activation_fn = None if idx == len(proj_sizes) - 1 else tf.nn.relu
            proj_out = conv1d(
                proj_out, proj_width, proj_size, activation_fn,
                is_training, 'proj_{}'.format(idx + 1))

        # Residual connection:
        if before_highway is not None:
            expanded_before_highway = tf.expand_dims(before_highway, 1)
            tiled_before_highway = tf.tile(
                expanded_before_highway, [1, tf.shape(proj_out)[1], 1])
            highway_input = proj_out + inputs + tiled_before_highway
        else:
            highway_input = proj_out + inputs

        # Handle dimensionality mismatch before the highway layers:
        if highway_input.shape[2] != rnn_size:
            highway_input = tf.layers.dense(highway_input, rnn_size)

        # HighwayNet of `highway_depth` layers:
        for idx in range(highway_depth):
            highway_input = highwaynet(highway_input, 'highway_%d' % (idx + 1))
        rnn_input = highway_input

        # Bidirectional GRU:
        if encoder_rnn_init_state is not None:
            initial_state_fw, initial_state_bw = \
                tf.split(encoder_rnn_init_state, 2, 1)
        else:
            initial_state_fw, initial_state_bw = None, None

        cell_fw, cell_bw = GRUCell(rnn_size), GRUCell(rnn_size)
        outputs, states = tf.nn.bidirectional_dynamic_rnn(
            cell_fw, cell_bw, rnn_input,
            sequence_length=input_lengths,
            initial_state_fw=initial_state_fw,
            initial_state_bw=initial_state_bw,
            dtype=tf.float32)

        return tf.concat(outputs, axis=2)  # Concat forward and backward outputs.


def batch_tile(tensor, batch_size):
    """Tiles `tensor` along a new leading batch dimension of size `batch_size`."""
    expanded_tensor = tf.expand_dims(tensor, 0)
    return tf.tile(
        expanded_tensor, [batch_size] + [1 for _ in tensor.get_shape()])


def highwaynet(inputs, scope):
    """Single highway layer: T * H(x) + (1 - T) * x with a learned sigmoid gate T."""
    highway_dim = int(inputs.get_shape()[-1])

    with tf.variable_scope(scope):
        H = tf.layers.dense(
            inputs, units=highway_dim, activation=tf.nn.relu, name='H')
        T = tf.layers.dense(
            inputs, units=highway_dim, activation=tf.nn.sigmoid, name='T',
            bias_initializer=tf.constant_initializer(-1.0))
        return H * T + inputs * (1.0 - T)
def conv1d(inputs, kernel_size, channels, activation, is_training, scope):
    """1-D convolution (same padding) followed by batch normalization."""
    with tf.variable_scope(scope):
        conv1d_output = tf.layers.conv1d(
            inputs, filters=channels, kernel_size=kernel_size,
            activation=activation, padding='same')
        return tf.layers.batch_normalization(conv1d_output, training=is_training)
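

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module): wires the modules
# above into a Tacotron-style text encoder. All hyperparameters below
# (vocabulary size, embedding size, prenet sizes, CBHG settings) are
# illustrative assumptions, not values taken from the original repository.
# Requires a TensorFlow 1.x environment with tf.contrib available.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    char_ids = tf.placeholder(tf.int32, [None, None], name='char_ids')      # [batch, time]
    char_lengths = tf.placeholder(tf.int32, [None], name='char_lengths')    # [batch]
    is_training = True

    # Character embeddings -> prenet -> encoder CBHG.
    embedded = get_embed(char_ids, num_inputs=256, embed_size=256, name='char_embedding')
    prenet_out = prenet(embedded, is_training, layer_sizes=[256, 128], drop_prob=0.5)
    encoder_out = cbhg(
        prenet_out, char_lengths, is_training,
        bank_size=16, bank_channel_size=128,
        maxpool_width=2, highway_depth=4, rnn_size=128,
        proj_sizes=[128, 128], proj_width=3, scope='encoder_cbhg')
    # encoder_out has shape [batch, time, 2 * rnn_size] (forward and backward
    # GRU outputs concatenated) and would normally feed an attention decoder.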