import io
import os
import sys
import json
import string
import argparse
import operator
import numpy as np
from glob import glob
from tqdm import tqdm
from nltk import ngrams
from difflib import SequenceMatcher
from collections import defaultdict

from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types

from utils import parallel_run
from text import text_to_sequence

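# NOTE: helpers used below but not imported here (get_audio_ids_from_paths,
# remove_puncuations, koreanize_numbers, has_number, ends_with_punctuation,
# get_paths_by_pattern) are presumably provided elsewhere in this project,
# e.g. by its text/Korean utility modules.
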
####################################################
# When one or two audio clips are missing in the middle
####################################################

def get_continuous_audio_paths(paths, debug=False):
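    """Fill gaps in a run of audio paths whose numeric ids should be consecutive.

    If ids are missing between the minimum and maximum, rebuild the full range
    from the first path, e.g. (illustrative ids)
    ["160.0003.flac", "160.0006.flac"] -> ["160.0003.flac", ..., "160.0006.flac"].
    """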
    audio_ids = get_audio_ids_from_paths(paths)
    min_id, max_id = min(audio_ids), max(audio_ids)

    if int(max_id) - int(min_id) + 1 != len(audio_ids):
        base_path = paths[0].replace(min_id, "{:0" + str(len(max_id)) + "d}")
        new_paths = [
            base_path.format(audio_id)
            for audio_id in range(int(min_id), int(max_id) + 1)]

        if debug: print("Missing audio : {} -> {}".format(paths, new_paths))
        return new_paths
    else:
        return paths

def get_argmax_key(info, with_value=False):
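    """Return the key with the largest value in `info`, optionally with the value."""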
    max_key = max(info.keys(), key=(lambda k: info[k]))

    if with_value:
        return max_key, info[max_key]
    else:
        return max_key

def similarity(text_a, text_b):
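    """Character-level similarity of two texts, ignoring whitespace and punctuation.

    When the first text is shorter than the second, the score is shifted down
    by 1 so that longer-or-equal matches are always preferred.
    """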
    text_a = "".join(remove_puncuations(text_a.strip()).split())
    text_b = "".join(remove_puncuations(text_b.strip()).split())

    score = SequenceMatcher(None, text_a, text_b).ratio()
    #score = 1 / (distance(decompose_ko_text(text_a), decompose_ko_text(text_b)) + 1e-5)
    #score = SequenceMatcher(None,
    #        decompose_ko_text(text_a), decompose_ko_text(text_b)).ratio()

    if len(text_a) < len(text_b):
        return -1 + score
    else:
        return score

def get_key_value_sorted(data):
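    """Return the dict's keys in sorted order and their values in the same order."""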
    keys = list(data.keys())
    keys.sort()
    values = [data[key] for key in keys]
    return keys, values

def replace_pred_with_book(
        path, book_path=None, threshold=0.9, max_candidate_num=5,
        min_post_char_check=2, max_post_char_check=7, max_n=5,
        max_allow_missing_when_matching=4, debug=False):
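    """Replace noisy speech-recognition outputs with the original book text.

    Reads a speech-*.json file of {audio_path: predicted text} plus the matching
    book .txt file, then:
      1. walks through the book words, picking for each prediction the most
         similar candidate span (controlled by `threshold` / `max_candidate_num`);
      2. finds exact n-gram matches (n = `max_n` .. 2) spanning several
         consecutive audio clips, allowing a few missing words;
      3. merges predictions that fall inside the same book sentence and returns
         a list of [[sentence], [audio paths]] pairs.
    """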

    #######################################
    # find the book text matching each pred
    #######################################

    if book_path is None:
        book_path = path.replace("speech", "text").replace("json", "txt")

    data = json.loads(open(path).read())

    keys, preds = get_key_value_sorted(data)

    book_words = [word for word in open(book_path).read().split() if word != "=="]
    book_texts = [text.replace('\n', '') for text in open(book_path).readlines()]

    loc = 0
    prev_key = None
    force_stop = False
    prev_end_loc = -1
    prev_sentence_ended = True

    prev_empty_skip = False
    prev_not_found_skip = False

    black_lists = ["160.{:04d}".format(audio_id) for audio_id in range(20, 36)]

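    # Stage 1: greedy alignment. `loc` is a cursor into book_words; for each
    # prediction, grow a window of book words until a sufficiently similar
    # candidate is found, then advance the cursor past it.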
    new_preds = {}
    for key, pred in zip(keys, preds):
        if debug: print(key, pred)

        if pred == "" or key in black_lists:
            prev_empty_skip = True
            continue

        width, counter = 1, 0
        sim_dict, loc_dict = {}, {}

        while True:
            words = book_words[loc:loc + width]

            if len(words) == 0:
                print("Force stop. Left {}, Del {} {}".
                        format(len(preds) - len(new_preds), new_preds[prev_key], prev_key))
                new_preds.pop(prev_key, None)
                force_stop = True
                break

            candidate_candidates = {}

            for _pred in list(set([pred, koreanize_numbers(pred)])):
                max_skip = 0 if has_number(_pred[0]) or \
                        _pred[0] in """"'“”’‘’""" else len(words)

                end_sims = []
                for idx in range(min(max_skip, 10)):
                    text = " ".join(words[idx:])

                    ################################################
                    # The score of the trailing characters matters too
                    ################################################

                    for jdx in range(min_post_char_check,
                                     max_post_char_check):
                        sim = similarity(
                                "".join(_pred.split())[-jdx:],
                                "".join(text.split())[-jdx:])
                        end_sims.append(sim)

                    candidate_candidates[text] = similarity(_pred, text)

            candidate, sim = get_argmax_key(
                    candidate_candidates, with_value=True)

            if sim > threshold or max(end_sims + [-1]) > threshold - 0.2 or \
                    len(sim_dict) > 0:
                sim_dict[candidate] = sim
                loc_dict[candidate] = loc + width

            if len(sim_dict) > 0:
                counter += 1

            if counter > max_candidate_num:
                break

            width += 1

            if width - len(_pred.split()) > 5:
                break

        if force_stop:
            break

        if len(sim_dict) != 0:
            #############################################################
            # Check missing words between prev pred and current pred
            #############################################################

            if prev_key is not None:
                cur_idx = int(key.rsplit('.', 2)[-2])
                prev_idx = int(prev_key.rsplit('.', 2)[-2])

                if cur_idx - prev_idx > 10:
                    force_stop = True
                    break

            # Words are aligned based on the prediction but may miss a few,
            # because Google speech recognition sometimes skips one or two words.
            # ex. ('오누이는 서로 자기가 할 일을 정했다.', '서로 자기가 할 일을 정했다.')
            original_candidate = new_candidate = get_argmax_key(sim_dict)

            word_to_find = original_candidate.split()[0]

            if not prev_empty_skip:
                search_idx = book_words[prev_end_loc:].index(word_to_find) \
                        if word_to_find in book_words[prev_end_loc:] else -1

                if 0 < search_idx < 4 and not prev_sentence_ended:
                    words_to_check = book_words[prev_end_loc:prev_end_loc + search_idx]

                    if ends_with_punctuation(words_to_check[0]):
                        tmp = " ".join([new_preds[prev_key]] + words_to_check[:1])
                        if debug: print(prev_key, tmp, new_preds[prev_key])
                        new_preds[prev_key] = tmp

                        prev_end_loc += 1
                        prev_sentence_ended = True

                search_idx = book_words[prev_end_loc:].index(word_to_find) \
                        if word_to_find in book_words[prev_end_loc:] else -1

                if 0 < search_idx < 4 and prev_sentence_ended:
                    words_to_check = book_words[prev_end_loc:prev_end_loc + search_idx]

                    if not any(ends_with_punctuation(word) for word in words_to_check):
                        new_candidate = " ".join(words_to_check + [original_candidate])
                        if debug: print(key, new_candidate, original_candidate)

            new_preds[key] = new_candidate
            prev_sentence_ended = ends_with_punctuation(new_candidate)

            loc = loc_dict[original_candidate]
            prev_key = key
            prev_not_found_skip = False
        else:
            loc += len(_pred.split()) - 1
            prev_sentence_ended = True
            prev_not_found_skip = True

        prev_end_loc = loc
        prev_empty_skip = False

        if debug:
            print("=", pred)
            # .get() avoids a KeyError when this pred was not matched
            print("=", new_preds.get(key), loc)

    if force_stop:
        print(" [!] Force stop: {}".format(path))

    align_diff = loc - len(book_words)

    if abs(align_diff) > 10:
        print(" => Align result of {}: {} - {} = {}".format(
                path, loc, len(book_words), align_diff))

    #######################################
    # find exact n-gram matches of preds
    #######################################

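    # Stage 2: exact n-gram matching. Consecutive predictions whose
    # concatenation equals a book sentence (allowing up to
    # `max_allow_missing_when_matching` words of the sentence to be dropped
    # at either end) are grouped into a single match.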
    finished_ids = []

    keys, preds = get_key_value_sorted(new_preds)

    if abs(align_diff) > 10:
        keys, preds = keys[:-30], preds[:-30]

    unfinished_ids = range(len(keys))
    text_matches = []

    for n in range(max_n, 1, -1):
        # materialize the generator: it is iterated once per n_allow_missing pass
        ngram_preds = list(ngrams(preds, n))

        for n_allow_missing in range(0, max_allow_missing_when_matching + 1):
            unfinished_ids = list(set(unfinished_ids) - set(finished_ids))

            existing_ngram_preds = []

            for ngram in ngram_preds:
                for text in book_texts:
                    candidates = [
                        " ".join(text.split()[:-n_allow_missing]),
                        " ".join(text.split()[n_allow_missing:]),
                    ]
                    for tmp_text in candidates:
                        if " ".join(ngram) == tmp_text:
                            existing_ngram_preds.append(ngram)
                            break

            tmp_keys = []
            cur_ngram = []

            ngram_idx = 0
            ngram_found = False

            for id_idx in unfinished_ids:
                key, pred = keys[id_idx], preds[id_idx]

                if ngram_idx >= len(existing_ngram_preds):
                    break

                cur_ngram = existing_ngram_preds[ngram_idx]

                if pred in cur_ngram:
                    ngram_found = True

                    tmp_keys.append(key)
                    finished_ids.append(id_idx)

                    if len(tmp_keys) == len(cur_ngram):
                        if debug: print(n_allow_missing, tmp_keys, cur_ngram)

                        tmp_keys = get_continuous_audio_paths(tmp_keys, debug)
                        text_matches.append(
                                [[" ".join(cur_ngram)], tmp_keys])

                        ngram_idx += 1
                        tmp_keys = []
                        cur_ngram = []
                else:
                    if pred == cur_ngram[-1]:
                        ngram_idx += 1
                        tmp_keys = []
                        cur_ngram = []
                    else:
                        if len(tmp_keys) > 0:
                            ngram_found = False

                            tmp_keys = []
                            cur_ngram = []

    for id_idx in range(len(keys)):
        if id_idx not in finished_ids:
            key, pred = keys[id_idx], preds[id_idx]

            text_matches.append([[pred], [key]])

    ##############################################################
    # match sentences again, just in case, after adding missing words
    ##############################################################

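    # Stage 3: sort matches by their largest audio id, then merge consecutive
    # predictions that appear in the same book sentence into a single entry.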
    max_keys = [max(get_audio_ids_from_paths(item[1], as_int=True)) for item in text_matches]
    sorted_text_matches = [item for _, item in sorted(zip(max_keys, text_matches))]

    preds = [item[0][0] for item in sorted_text_matches]
    keys = [item[1] for item in sorted_text_matches]

    def book_sentence_idx_search(query, book_texts):
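        """Return (index, sentence) of the first book sentence containing `query`,
        or (False, False) when no sentence matches."""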
        for idx, text in enumerate(book_texts):
            if query in text:
                return idx, text
        return False, False

    text_matches = []
    idx, book_cursor_idx = 0, 0

    if len(preds) == 0:
        return []

    while True:
        tmp_texts = book_texts[book_cursor_idx:]

        jdx = 0
        tmp_pred = preds[idx]
        idxes_to_merge = [idx]

        prev_sent_idx, prev_sent = book_sentence_idx_search(tmp_pred, tmp_texts)
        # default so the max() below is defined even if the inner loop never runs
        sent_idx, sent = prev_sent_idx, prev_sent
        while idx + jdx + 1 < len(preds):
            jdx += 1

            tmp_pred = preds[idx + jdx]
            sent_idx, sent = book_sentence_idx_search(tmp_pred, tmp_texts)

            if not sent_idx:
                if debug: print(" [!] NOT FOUND: {}".format(tmp_pred))
                break

            if prev_sent_idx == sent_idx:
                idxes_to_merge.append(idx + jdx)
            else:
                break

        new_keys = get_continuous_audio_paths(
                sum([keys[jdx] for jdx in idxes_to_merge], []))
        text_matches.append([[tmp_texts[prev_sent_idx]], new_keys])

        if len(new_keys) > 1:
            book_cursor_idx += 1

        book_cursor_idx = max(book_cursor_idx, sent_idx)

        if idx == len(preds) - 1:
            break
        idx = idx + jdx

    # Counter([len(i) for i in text_matches.values()])
    return text_matches

def get_text_from_audio_batch(paths, multi_process=False):
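    """Transcribe a batch of audio files and merge the per-file results into a
    single {audio_path: transcript} dict (parallelized via utils.parallel_run)."""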
    results = {}
    items = parallel_run(get_text_from_audio, paths,
                         desc="get_text_from_audio_batch")
    for item in items:
        results.update(item)
    return results

def get_text_from_audio(path):
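    """Transcribe a single FLAC file with the Google Cloud Speech API (ko-KR).

    The result is cached next to the audio as a .txt file holding
    {audio_path: transcript}; errors are retried a few times before giving up.
    """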
    error_count = 0

    txt_path = path.replace('flac', 'txt')

    if os.path.exists(txt_path):
        with open(txt_path) as f:
            out = json.loads(f.read())
        return out

    out = {}
    while True:
        try:
            client = speech.SpeechClient()

            with io.open(path, 'rb') as audio_file:
                content = audio_file.read()
                audio = types.RecognitionAudio(content=content)

            config = types.RecognitionConfig(
                encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
                sample_rate_hertz=16000,
                language_code='ko-KR')

            response = client.recognize(config, audio)
            if len(response.results) > 0:
                alternatives = response.results[0].alternatives

                results = [alternative.transcript for alternative in alternatives]
                assert len(results) == 1, "More than 1 results: {}".format(results)

                out = { path: "" if len(results) == 0 else results[0] }
                print(results[0])
                break
            # no results: give up without retrying
            break
        except Exception:
            error_count += 1
            print("Skip warning for {} for {} times".
                    format(path, error_count))

            if error_count > 5:
                break
            else:
                continue

    with open(txt_path, 'w') as f:
        json.dump(out, f, indent=2, ensure_ascii=False)

    return out

if __name__ == '__main__':
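    # Pipeline:
    #   [1] google_speech : transcribe every audio file and write
    #       <asset_dir>/speech-<book_id>.json
    #   [2] text_match    : align the transcripts with the book text and write
    #       clean-speech-<book_id>.json
    #   [3] merge         : merge the clean files into a single metadata file
    #       (currently disabled below)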
    parser = argparse.ArgumentParser()
    parser.add_argument('--asset-dir', type=str, default='assets')
    parser.add_argument('--data-dir', type=str, default='audio')
    parser.add_argument('--pattern', type=str, default="audio/*.flac")
    parser.add_argument('--metadata', type=str, default="metadata.json")
    config, unparsed = parser.parse_known_args()

    paths = glob(config.pattern)
    paths.sort()

    book_ids = list(set([
        os.path.basename(path).split('.', 1)[0] for path in paths]))
    book_ids.sort()

    def get_finished_ids():
        finished_paths = glob(os.path.join(
            config.asset_dir, "speech-*.json"))
        finished_ids = list(set([
            os.path.basename(path).split('.', 1)[0].replace("speech-", "")
            for path in finished_paths]))
        finished_ids.sort()
        return finished_ids

    finished_ids = get_finished_ids()

    print("# Finished : {}/{}".format(len(finished_ids), len(book_ids)))

    book_ids_to_parse = list(set(book_ids) - set(finished_ids))
    book_ids_to_parse.sort()

    assert os.path.exists(config.asset_dir), "asset_dir not found"

    pbar = tqdm(book_ids_to_parse, "[1] google_speech",
                initial=len(finished_ids), total=len(book_ids))

    for book_id in pbar:
        current_paths = glob(config.pattern.replace("*", "{}.*".format(book_id)))
        pbar.set_description("[1] google_speech : {}".format(book_id))

        results = get_text_from_audio_batch(current_paths)

        filename = "speech-{}.json".format(book_id)
        path = os.path.join(config.asset_dir, filename)

        with open(path, "w") as f:
            json.dump(results, f, indent=2, ensure_ascii=False)

    finished_ids = get_finished_ids()

    for book_id in tqdm(finished_ids, "[2] text_match"):
        filename = "speech-{}.json".format(book_id)
        path = os.path.join(config.asset_dir, filename)
        clean_path = path.replace("speech", "clean-speech")

        if os.path.exists(clean_path):
            print(" [*] Skip {}".format(clean_path))
        else:
            results = replace_pred_with_book(path)
            with open(clean_path, "w") as f:
                json.dump(results, f, indent=2, ensure_ascii=False)

    # Dummy

    if False:
        match_paths = get_paths_by_pattern(
            config.asset_dir, 'clean-speech-*.json')

        metadata_path = os.path.join(config.data_dir, config.metadata)

        print(" [3] Merge clean-speech-*.json into {}".format(metadata_path))

        merged_data = []
        for path in match_paths:
            with open(path) as f:
                merged_data.extend(json.loads(f.read()))

        import ipdb; ipdb.set_trace()

        with open(metadata_path, 'w') as f:
            json.dump(merged_data, f, indent=2, ensure_ascii=False)