multi-speaker-tacotron-tens.../audio/google_speech.py

521 lines
17 KiB
Python

import io
import os
import sys
import json
import string
import argparse
import operator
import numpy as np
from glob import glob
from tqdm import tqdm
from nltk import ngrams
from difflib import SequenceMatcher
from collections import defaultdict
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types
from utils import parallel_run
from text import text_to_sequence
####################################################
# When one or two audio is missed in the middle
####################################################
def get_continuous_audio_paths(paths, debug=False):
    """Fill gaps in a run of numbered audio paths.

    Extracts the numeric ids from `paths`; if the ids are not contiguous
    (one or two clips missing in the middle), rebuilds a full path list
    covering every id from min to max using the first path as a template.
    Otherwise returns `paths` unchanged.
    """
    audio_ids = get_audio_ids_from_paths(paths)
    min_id, max_id = min(audio_ids), max(audio_ids)

    expected_count = int(max_id) - int(min_id) + 1
    if expected_count == len(audio_ids):
        # Already contiguous: nothing to repair.
        return paths

    # Turn the first path into a zero-padded format template and expand it
    # over the whole id range.
    template = paths[0].replace(min_id, "{:0" + str(len(max_id)) + "d}")
    filled = [template.format(audio_id)
              for audio_id in range(int(min_id), int(max_id) + 1)]
    if debug: print("Missing audio : {} -> {}".format(paths, filled))
    return filled
def get_argmax_key(info, with_value=False):
    """Return the key of `info` with the largest value.

    With `with_value=True`, return a `(key, value)` tuple instead.
    """
    best = max(info, key=info.get)
    return (best, info[best]) if with_value else best
def similarity(text_a, text_b):
    """Punctuation- and whitespace-insensitive similarity of two strings.

    Both strings are stripped, de-punctuated and joined without spaces,
    then compared with `difflib.SequenceMatcher`. When the normalized
    `text_a` is shorter than `text_b` the score is shifted down by 1
    (so it lands in [-1, 0)) to penalize truncated predictions.
    """
    normalized_a = "".join(remove_puncuations(text_a.strip()).split())
    normalized_b = "".join(remove_puncuations(text_b.strip()).split())

    score = SequenceMatcher(None, normalized_a, normalized_b).ratio()

    #score = 1 / (distance(decompose_ko_text(text_a), decompose_ko_text(text_b)) + 1e-5)
    #score = SequenceMatcher(None,
    #        decompose_ko_text(text_a), decompose_ko_text(text_b)).ratio()

    if len(normalized_a) < len(normalized_b):
        return score - 1
    return score
def get_key_value_sorted(data):
    """Return `(keys, values)` of the dict `data`, both ordered by key."""
    pairs = sorted(data.items())
    sorted_keys = [key for key, _ in pairs]
    sorted_values = [value for _, value in pairs]
    return sorted_keys, sorted_values
def replace_pred_with_book(
        path, book_path=None, threshold=0.9, max_candidate_num=5,
        min_post_char_check=2, max_post_char_check=7, max_n=5,
        max_allow_missing_when_matching=4, debug=False):
    """Align speech-recognition predictions with the ground-truth book text.

    `path` is a JSON file mapping audio keys to predicted transcripts.
    The matching book text is read from `book_path` (derived from `path`
    by replacing "speech"->"text" and "json"->"txt" when not given).
    The function walks the book word-by-word, replaces each prediction
    with the best-matching book span, then groups predictions that form
    exact n-grams of book sentences, and finally merges predictions that
    fall inside the same book sentence.

    Returns a list of ``[[book_sentence], [audio_keys]]`` matches
    (empty list when nothing could be aligned).

    NOTE(review): indentation was reconstructed from a whitespace-mangled
    source; behavior-sensitive ordering preserved as-is.
    """

    #######################################
    # find text book from pred
    #######################################

    if book_path is None:
        book_path = path.replace("speech", "text").replace("json", "txt")

    data = json.loads(open(path).read())
    keys, preds = get_key_value_sorted(data)

    # "==" is used as a separator token in the book files; drop it.
    book_words = [word for word in open(book_path).read().split() if word != "=="]
    book_texts = [text.replace('\n', '') for text in open(book_path).readlines()]

    # `loc` is the cursor into `book_words`; the rest track the state of the
    # previously aligned prediction.
    loc = 0
    prev_key = None
    force_stop = False
    prev_end_loc = -1
    prev_sentence_ended = True

    prev_empty_skip = False
    prev_not_found_skip = False

    # Hard-coded keys to ignore — presumably known-bad clips of book 160;
    # verify against the dataset.
    black_lists = ["160.{:04d}".format(audio_id) for audio_id in range(20, 36)]

    new_preds = {}
    for key, pred in zip(keys, preds):
        if debug: print(key, pred)

        if pred == "" or key in black_lists:
            prev_empty_skip = True
            continue

        # Grow a window of book words until a good-enough candidate match
        # for `pred` is found (or we give up).
        width, counter = 1, 0
        sim_dict, loc_dict = {}, {}

        while True:
            words = book_words[loc:loc + width]

            if len(words) == 0:
                # Ran past the end of the book: drop the previous match and
                # abort the whole alignment.
                print("Force stop. Left {}, Del {} {}". \
                        format(len(preds) - len(new_preds), new_preds[prev_key], prev_key))
                new_preds.pop(prev_key, None)
                force_stop = True
                break

            candidate_candidates = {}
            # Try both the raw prediction and its number-to-Korean-word form.
            for _pred in list(set([pred, koreanize_numbers(pred)])):
                # Don't allow skipping leading book words when the prediction
                # starts with a digit or a quotation mark.
                max_skip = 0 if has_number(_pred[0]) or \
                        _pred[0] in """"'“”’‘’""" else len(words)

                end_sims = []
                for idx in range(min(max_skip, 10)):
                    text = " ".join(words[idx:])

                    ################################################
                    # Score of trailing sentence is also important
                    ################################################
                    for jdx in range(min_post_char_check,
                                     max_post_char_check):
                        sim = similarity(
                                "".join(_pred.split())[-jdx:],
                                "".join(text.split())[-jdx:])
                        end_sims.append(sim)

                    candidate_candidates[text] = similarity(_pred, text)

            # NOTE(review): from here on `_pred` and `end_sims` are the
            # leaked values of the LAST iteration of the loop above.
            candidate, sim = get_argmax_key(
                    candidate_candidates, with_value=True)

            # Keep collecting candidates once the first one passed the bar
            # (the `len(sim_dict) > 0` clause).
            if sim > threshold or max(end_sims + [-1]) > threshold - 0.2 or \
                    len(sim_dict) > 0:
                sim_dict[candidate] = sim
                loc_dict[candidate] = loc + width

            if len(sim_dict) > 0:
                counter += 1

            if counter > max_candidate_num:
                break

            width += 1

            # Stop widening once the window is much longer than the prediction.
            if width - len(_pred.split()) > 5:
                break

        if force_stop:
            break

        if len(sim_dict) != 0:
            #############################################################
            # Check missing words between prev pred and current pred
            #############################################################

            if prev_key is not None:
                cur_idx = int(key.rsplit('.', 2)[-2])
                prev_idx = int(prev_key.rsplit('.', 2)[-2])

                # A jump of more than 10 clip ids means the data is broken.
                if cur_idx - prev_idx > 10:
                    force_stop = True
                    break

            # word aligned based on prediction but may contain missing words
            # because google speech recognition sometimes skip one or two word
            # ex. ('오누이는 서로 자기가 할 일을 정했다.', '서로 자기가 할 일을 정했다.')
            original_candidate = new_candidate = get_argmax_key(sim_dict)
            word_to_find = original_candidate.split()[0]

            if not prev_empty_skip:
                search_idx = book_words[prev_end_loc:].index(word_to_find) \
                        if word_to_find in book_words[prev_end_loc:] else -1

                # Case 1: the previous sentence did not end — append the
                # sentence-closing word to the PREVIOUS prediction.
                if 0 < search_idx < 4 and not prev_sentence_ended:
                    words_to_check = book_words[prev_end_loc:prev_end_loc + search_idx]

                    if ends_with_punctuation(words_to_check[0]) == True:
                        tmp = " ".join([new_preds[prev_key]] + words_to_check[:1])
                        if debug: print(prev_key, tmp, new_preds[prev_key])
                        new_preds[prev_key] = tmp

                        prev_end_loc += 1
                        prev_sentence_ended = True

                search_idx = book_words[prev_end_loc:].index(word_to_find) \
                        if word_to_find in book_words[prev_end_loc:] else -1

                # Case 2: skipped words belong to the CURRENT sentence —
                # prepend them to the current candidate.
                if 0 < search_idx < 4 and prev_sentence_ended:
                    words_to_check = book_words[prev_end_loc:prev_end_loc + search_idx]

                    if not any(ends_with_punctuation(word) for word in words_to_check):
                        new_candidate = " ".join(words_to_check + [original_candidate])
                        if debug: print(key, new_candidate, original_candidate)

            new_preds[key] = new_candidate
            prev_sentence_ended = ends_with_punctuation(new_candidate)

            loc = loc_dict[original_candidate]
            prev_key = key
            prev_not_found_skip = False
        else:
            # No candidate found: advance the cursor roughly past the
            # prediction. NOTE(review): uses leaked `_pred`, not `pred`.
            loc += len(_pred.split()) - 1
            prev_sentence_ended = True
            prev_not_found_skip = True

        prev_end_loc = loc
        prev_empty_skip = False

        if debug:
            print("=", pred)
            # NOTE(review): raises KeyError in debug mode when the previous
            # branch did not store `new_preds[key]` (not-found case).
            print("=", new_preds[key], loc)

    if force_stop:
        print(" [!] Force stop: {}".format(path))

    # How far the cursor ended up from the true end of the book.
    align_diff = loc - len(book_words)

    if abs(align_diff) > 10:
        print(" => Align result of {}: {} - {} = {}".format(path, loc, len(book_words), align_diff))

    #######################################
    # find exact match of n-gram of pred
    #######################################

    finished_ids = []

    keys, preds = get_key_value_sorted(new_preds)

    # Badly aligned tail: drop the last 30 items outright.
    if abs(align_diff) > 10:
        keys, preds = keys[:-30], preds[:-30]

    unfinished_ids = range(len(keys))
    text_matches = []

    for n in range(max_n, 1, -1):
        # NOTE(review): `ngrams` returns a generator — it is exhausted on the
        # first `n_allow_missing` iteration, so later iterations see no
        # n-grams. Verify whether this is intended.
        ngram_preds = ngrams(preds, n)

        for n_allow_missing in range(0, max_allow_missing_when_matching + 1):
            unfinished_ids = list(set(unfinished_ids) - set(finished_ids))

            existing_ngram_preds = []

            # Keep only n-grams that exactly reproduce a book sentence,
            # allowing up to `n_allow_missing` words dropped at either end.
            for ngram in ngram_preds:
                for text in book_texts:
                    candidates = [
                            " ".join(text.split()[:-n_allow_missing]),
                            " ".join(text.split()[n_allow_missing:]),
                    ]
                    for tmp_text in candidates:
                        if " ".join(ngram) == tmp_text:
                            existing_ngram_preds.append(ngram)
                            break

            tmp_keys = []
            cur_ngram = []

            ngram_idx = 0
            # NOTE(review): `ngram_found` is written but never read.
            ngram_found = False

            # Greedily consume predictions that belong to the current n-gram.
            for id_idx in unfinished_ids:
                key, pred = keys[id_idx], preds[id_idx]

                if ngram_idx >= len(existing_ngram_preds):
                    break

                cur_ngram = existing_ngram_preds[ngram_idx]

                if pred in cur_ngram:
                    ngram_found = True
                    tmp_keys.append(key)
                    finished_ids.append(id_idx)

                    if len(tmp_keys) == len(cur_ngram):
                        # Whole n-gram covered: emit one match for it.
                        if debug: print(n_allow_missing, tmp_keys, cur_ngram)
                        tmp_keys = get_continuous_audio_paths(tmp_keys, debug)
                        text_matches.append(
                                [[" ".join(cur_ngram)], tmp_keys]
                        )

                        ngram_idx += 1

                        tmp_keys = []
                        cur_ngram = []
                else:
                    if pred == cur_ngram[-1]:
                        ngram_idx += 1

                        tmp_keys = []
                        cur_ngram = []
                    else:
                        if len(tmp_keys) > 0:
                            ngram_found = False
                            tmp_keys = []
                            cur_ngram = []

    # Predictions not covered by any n-gram become single-key matches.
    for id_idx in range(len(keys)):
        if id_idx not in finished_ids:
            key, pred = keys[id_idx], preds[id_idx]
            text_matches.append(
                    [[pred], [key]]
            )

    ##############################################################
    # ngram again for just in case after adding missing words
    ##############################################################

    # Re-sort matches by their highest audio id.
    max_keys = [max(get_audio_ids_from_paths(item[1], as_int=True)) for item in text_matches]
    sorted_text_matches = \
            [item for _, item in sorted(zip(max_keys, text_matches))]

    preds = [item[0][0] for item in sorted_text_matches]
    keys = [item[1] for item in sorted_text_matches]

    def book_sentence_idx_search(query, book_texts):
        # First book sentence containing `query`, as (index, sentence);
        # (False, False) when absent.
        for idx, text in enumerate(book_texts):
            if query in text:
                return idx, text
        return False, False

    text_matches = []
    idx, book_cursor_idx = 0, 0

    if len(preds) == 0:
        return []

    # Merge consecutive predictions that land in the same book sentence.
    while True:
        tmp_texts = book_texts[book_cursor_idx:]

        jdx = 0
        tmp_pred = preds[idx]
        idxes_to_merge = [idx]

        prev_sent_idx, prev_sent = book_sentence_idx_search(tmp_pred, tmp_texts)

        while idx + jdx + 1 < len(preds):
            jdx += 1
            tmp_pred = preds[idx + jdx]
            sent_idx, sent = book_sentence_idx_search(tmp_pred, tmp_texts)
            # NOTE(review): `not sent_idx` is also true for a legitimate match
            # at index 0 — confirm this is intended.
            if not sent_idx:
                if debug: print(" [!] NOT FOUND: {}".format(tmp_pred))
                break

            if prev_sent_idx == sent_idx:
                idxes_to_merge.append(idx + jdx)
            else:
                break

        new_keys = get_continuous_audio_paths(
                sum([keys[jdx] for jdx in idxes_to_merge], []))
        text_matches.append([ [tmp_texts[prev_sent_idx]], new_keys ])

        if len(new_keys) > 1:
            book_cursor_idx += 1

        # NOTE(review): `sent_idx` is unbound here when the inner while-loop
        # body never ran (single remaining prediction) — verify.
        book_cursor_idx = max(book_cursor_idx, sent_idx)

        if idx == len(preds) - 1:
            break

        idx = idx + jdx

    # Counter([len(i) for i in text_matches.values()])
    return text_matches
def get_text_from_audio_batch(paths, multi_process=False):
    """Transcribe many audio files and merge the per-file result dicts.

    Delegates to `parallel_run(get_text_from_audio, ...)`; each worker
    returns a {path: transcript} dict and all of them are folded into one.
    NOTE(review): `multi_process` is accepted but never used.
    """
    merged = {}
    partial_results = parallel_run(get_text_from_audio, paths,
                                   desc="get_text_from_audio_batch")
    for partial in partial_results:
        merged.update(partial)
    return merged
def get_text_from_audio(path):
    """Transcribe a FLAC file with Google Cloud Speech (ko-KR).

    The result is cached as JSON beside the audio file ('flac' replaced by
    'txt' in the path); a cached file is returned without hitting the API.
    The API call is retried up to 5 extra times on failure.

    Returns a ``{path: transcript}`` dict, or ``{}`` when recognition
    produced nothing.
    """
    error_count = 0

    txt_path = path.replace('flac', 'txt')
    if os.path.exists(txt_path):
        # Cache hit: reuse the stored transcription.
        # (Fix: previously the file was opened twice, leaking one handle.)
        with open(txt_path) as f:
            return json.load(f)

    out = {}
    while True:
        try:
            client = speech.SpeechClient()

            with io.open(path, 'rb') as audio_file:
                content = audio_file.read()

            audio = types.RecognitionAudio(content=content)
            config = types.RecognitionConfig(
                encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
                sample_rate_hertz=16000,
                language_code='ko-KR')

            response = client.recognize(config, audio)
            if len(response.results) > 0:
                alternatives = response.results[0].alternatives
                results = [alternative.transcript for alternative in alternatives]
                assert len(results) == 1, "More than 1 results: {}".format(results)

                out = { path: results[0] }
                print(results[0])
            break
        # Fix: was a bare `except:` which also swallowed KeyboardInterrupt.
        except Exception:
            error_count += 1
            print("Skip warning for {} for {} times". \
                    format(path, error_count))

            if error_count > 5:
                break

    # Persist the (possibly empty) result so the next run is a cache hit.
    with open(txt_path, 'w') as f:
        json.dump(out, f, indent=2, ensure_ascii=False)

    return out
if __name__ == '__main__':
    # Pipeline: [1] transcribe every audio file per book with Google Speech,
    # [2] align the transcriptions with the book text, [3] (disabled) merge
    # the cleaned matches into a single metadata file.
    parser = argparse.ArgumentParser()
    parser.add_argument('--asset-dir', type=str, default='assets')
    parser.add_argument('--data-dir', type=str, default='audio')
    parser.add_argument('--pattern', type=str, default="audio/*.flac")
    parser.add_argument('--metadata', type=str, default="metadata.json")
    # NOTE(review): `unparsed` is never used.
    config, unparsed = parser.parse_known_args()

    paths = glob(config.pattern)
    paths.sort()
    # NOTE(review): no-op self-assignment (likely a leftover of a slice
    # used for debugging).
    paths = paths

    # Book id = filename prefix before the first '.'.
    book_ids = list(set([
            os.path.basename(path).split('.', 1)[0] for path in paths]))
    book_ids.sort()

    def get_finished_ids():
        # Book ids that already have a speech-<id>.json in the asset dir.
        finished_paths = glob(os.path.join(
                config.asset_dir, "speech-*.json"))
        finished_ids = list(set([
                os.path.basename(path).split('.', 1)[0].replace("speech-", "") for path in finished_paths]))
        finished_ids.sort()
        return finished_ids

    finished_ids = get_finished_ids()
    print("# Finished : {}/{}".format(len(finished_ids), len(book_ids)))

    book_ids_to_parse = list(set(book_ids) - set(finished_ids))
    book_ids_to_parse.sort()

    # NOTE(review): message typo "assert_dir" (should read "asset_dir") —
    # left as-is since it is a runtime string.
    assert os.path.exists(config.asset_dir), "assert_dir not found"

    pbar = tqdm(book_ids_to_parse, "[1] google_speech",
            initial=len(finished_ids), total=len(book_ids))

    # [1] Transcribe each remaining book and store speech-<id>.json.
    for book_id in pbar:
        current_paths = glob(config.pattern.replace("*", "{}.*".format(book_id)))
        pbar.set_description("[1] google_speech : {}".format(book_id))

        results = get_text_from_audio_batch(current_paths)

        filename = "speech-{}.json".format(book_id)
        path = os.path.join(config.asset_dir, filename)
        with open(path, "w") as f:
            json.dump(results, f, indent=2, ensure_ascii=False)

    finished_ids = get_finished_ids()

    # [2] Align each speech-<id>.json with the book and store
    # clean-speech-<id>.json (skipping already-cleaned books).
    for book_id in tqdm(finished_ids, "[2] text_match"):
        filename = "speech-{}.json".format(book_id)
        path = os.path.join(config.asset_dir, filename)

        clean_path = path.replace("speech", "clean-speech")
        if os.path.exists(clean_path):
            print(" [*] Skip {}".format(clean_path))
        else:
            results = replace_pred_with_book(path)
            with open(clean_path, "w") as f:
                json.dump(results, f, indent=2, ensure_ascii=False)

    # Dummy
    # NOTE(review): dead code behind `if False` — references undefined
    # `get_paths_by_pattern` and contains an ipdb breakpoint.
    if False:
        match_paths = get_paths_by_pattern(
                config.asset_dir, 'clean-speech-*.json')

        metadata_path = os.path.join(config.data_dir, config.metadata)
        print(" [3] Merge clean-speech-*.json into {}".format(metadata_path))

        merged_data = []
        for path in match_paths:
            with open(path) as f:
                merged_data.extend(json.loads(f.read()))

        import ipdb; ipdb.set_trace()

        with open(metadata_path, 'w') as f:
            json.dump(merged_data, f, indent=2, ensure_ascii=False)