multi-speaker-tacotron-tens.../text/korean.py

# Code based on 

import re
import os
import ast
import json
from jamo import hangul_to_jamo, h2j, j2h

from .ko_dictionary import english_dictionary, etc_dictionary

# Special symbols: padding, end-of-sequence, accepted punctuation and space.
PAD = "_"
EOS = "~"
PUNC = "!'(),-.:;?"
SPACE = " "

# Conjoining Hangul jamo, built from their Unicode code point ranges
# (the upper bound of each range is exclusive).
JAMO_LEADS = "".join(map(chr, range(0x1100, 0x1113)))
JAMO_VOWELS = "".join(map(chr, range(0x1161, 0x1176)))
JAMO_TAILS = "".join(map(chr, range(0x11A8, 0x11C3)))

# The position of a symbol inside ALL_SYMBOLS is its integer id.
VALID_CHARS = JAMO_LEADS + JAMO_VOWELS + JAMO_TAILS + PUNC + SPACE
ALL_SYMBOLS = PAD + EOS + VALID_CHARS

id_to_char = dict(enumerate(ALL_SYMBOLS))
char_to_id = {symbol: index for index, symbol in id_to_char.items()}

# Regex for a quoted span: opening quote, shortest possible content, closing quote.
quote_checker = """([`"'＂“‘])(.+?)([`"'＂”’])"""

def is_lead(char):
    """Return True if `char` occurs in JAMO_LEADS (leading-consonant jamo).

    This is a substring test, so an empty string also yields True.
    """
    return char in JAMO_LEADS

def is_vowel(char):
    """Return True if `char` occurs in JAMO_VOWELS (medial-vowel jamo).

    This is a substring test, so an empty string also yields True.
    """
    return char in JAMO_VOWELS

def is_tail(char):
    """Return True if `char` occurs in JAMO_TAILS (trailing-consonant jamo).

    This is a substring test, so an empty string also yields True.
    """
    return char in JAMO_TAILS

def get_mode(char):
    """Classify `char` by its position within a Hangul syllable.

    Returns 0 for a leading consonant, 1 for a vowel, 2 for a trailing
    consonant, and -1 for anything that is not a conjoining jamo.
    """
    # Tables are checked in lead / vowel / tail order; the index is the mode.
    for mode, table in enumerate((JAMO_LEADS, JAMO_VOWELS, JAMO_TAILS)):
        if char in table:
            return mode
    return -1

def _get_text_from_candidates(candidates):
    """Turn a buffer of jamo collected for one syllable into output text.

    Args:
        candidates: list of jamo characters, expected in lead / vowel / tail
            order.

    Returns:
        "" for an empty buffer, the compatibility-jamo form of a lone jamo,
        or the syllable composed by `j2h` from two or three jamo.
    """
    if not candidates:
        return ""

    if len(candidates) == 1:
        # The original called `_jamo_char_to_hcj`, a name this module never
        # imports, so a lone jamo raised NameError.  `j2hcj` is the public
        # jamo API for the same conversion; it is imported here so the
        # module-level import line does not have to change.
        from jamo import j2hcj
        return j2hcj(candidates[0])

    return j2h(**dict(zip(["lead", "vowel", "tail"], candidates)))

def jamo_to_korean(text):
    """Recompose a jamo sequence into Hangul syllables.

    `text` is first decomposed with `h2j`, so already-composed syllables are
    accepted too.  Jamo are buffered until the next leading consonant or
    non-jamo character, at which point the buffer is composed and emitted.
    Non-jamo characters are copied through unchanged.
    """
    pieces = []
    pending = []

    for jamo_char in h2j(text):
        mode = get_mode(jamo_char)

        if mode > 0:
            # Vowel or trailing consonant: extend the current syllable.
            pending.append(jamo_char)
            continue

        # A leading consonant or a non-jamo character closes the syllable.
        pieces.append(_get_text_from_candidates(pending))
        if mode == 0:
            pending = [jamo_char]
        else:
            pieces.append(jamo_char)
            pending = []

    # Flush whatever is still buffered at the end of the input.
    pieces.append(_get_text_from_candidates(pending))
    return "".join(pieces)

# Digit character -> Sino-Korean reading, used to read digits one by one.
num_to_kor = dict(zip("0123456789", "영일이삼사오육칠팔구"))

# Unit symbols -> Korean readings.  normalize_number() applies the first
# table before the second, so 'cm' / 'mm' / 'km' are consumed before the
# bare 'm' rule can touch them.
unit_to_kor1 = {
    "%": "퍼센트",
    "cm": "센치미터",
    "mm": "밀리미터",
    "km": "킬로미터",
    "kg": "킬로그람",
}
unit_to_kor2 = {
    "m": "미터",
}

# Upper-case Latin letter -> Korean reading of the letter name; used to
# spell out all-caps words letter by letter.
upper_to_kor = {
    "A": "에이",
    "B": "비",
    "C": "씨",
    "D": "디",
    "E": "이",
    "F": "에프",
    "G": "지",
    "H": "에이치",
    "I": "아이",
    "J": "제이",
    "K": "케이",
    "L": "엘",
    "M": "엠",
    "N": "엔",
    "O": "오",
    "P": "피",
    "Q": "큐",
    "R": "알",
    "S": "에스",
    "T": "티",
    "U": "유",
    "V": "브이",
    "W": "더블유",
    "X": "엑스",
    "Y": "와이",
    "Z": "지",
}

def compare_sentence_with_jamo(text1, text2):
    """Return True when the two texts DIFFER after jamo decomposition.

    Both arguments are decomposed with `h2j` and the results compared with
    `!=`, so identical jamo sequences yield False.
    """
    # The original compared against an undefined name `text`, which raised
    # NameError on every call; the second operand is `text2`.
    return h2j(text1) != h2j(text2)

def tokenize(text, as_id=False):
    """Normalize `text` and split it into jamo tokens terminated by EOS.

    Args:
        text: raw input sentence.
        as_id: when True, return integer ids looked up in `char_to_id`
            instead of the characters themselves.

    Returns:
        A list of tokens (or ids) whose last element stands for EOS.

    Raises:
        KeyError: if `as_id` is True and a token is missing from `char_to_id`.
    """
    jamo_seq = list(hangul_to_jamo(normalize(text)))
    jamo_seq.append(EOS)

    if not as_id:
        return jamo_seq
    return [char_to_id[jamo] for jamo in jamo_seq]

def tokenizer_fn(iterator):
    """Lazily yield the character tokens of every sentence in `iterator`.

    Each sentence goes through `tokenize(..., as_id=False)` and the
    resulting token lists are flattened into one stream.
    """
    return (
        jamo
        for sentence in iterator
        for jamo in tokenize(sentence, as_id=False)
    )

def normalize(text):
    """Rewrite raw text into a form that can be read aloud in Korean.

    Steps, in order: strip surrounding whitespace, drop parenthesised day
    markers and parenthesised CJK-ideograph runs, apply the `etc_dictionary`
    replacements, replace known English words, spell out all-caps words,
    re-wrap quoted spans, and finally read units and numbers.

    Args:
        text: the raw sentence.

    Returns:
        The normalized sentence.
    """
    text = text.strip()

    # Raw strings: `\(` and `\d` are invalid escapes in a normal literal and
    # trigger a SyntaxWarning on Python 3.12+.  The patterns are unchanged.
    # Drop parenthesised day markers such as "(13일)".
    text = re.sub(r'\(\d+일\)', '', text)
    # Drop parenthesised runs of CJK ideographs such as "(猪突)".
    text = re.sub(r'\([⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+\)', '', text)

    text = normalize_with_dictionary(text, etc_dictionary)
    text = normalize_english(text)
    text = re.sub(r'[a-zA-Z]+', normalize_upper, text)

    text = normalize_quote(text)
    text = normalize_number(text)

    return text

def normalize_with_dictionary(text, dic):
    """Replace every occurrence of a key of `dic` in `text` with its value.

    Keys are matched literally (they are regex-escaped).  When several keys
    could match at the same position the longest one wins, so a short key
    such as 'm' can no longer shadow a longer one such as 'mm'.

    Args:
        text: the string to rewrite.
        dic: mapping from literal substring to replacement string.

    Returns:
        The rewritten string, or `text` itself when no key occurs in it.
    """
    if not any(key in text for key in dic):
        return text

    # Regex alternation takes the first alternative that matches, so order
    # the keys longest-first.  sorted() is stable: keys of equal length keep
    # their original dictionary order.
    keys = sorted(dic, key=len, reverse=True)
    pattern = re.compile('|'.join(re.escape(key) for key in keys))
    return pattern.sub(lambda found: dic[found.group()], text)

def normalize_english(text):
    """Replace Latin-letter words that have an entry in `english_dictionary`.

    Every maximal run of ASCII letters is looked up; words without an entry
    are left exactly as they were.
    """
    def lookup(match):
        token = match.group()
        return english_dictionary.get(token) if token in english_dictionary else token

    return re.sub("([A-Za-z]+)", lookup, text)

def normalize_upper(text):
    """`re.sub` callback that spells out an all-upper-case word in Korean.

    Args:
        text: a regex match object (despite the name); group 0 is the word.

    Returns:
        The concatenated `upper_to_kor` readings when every character is
        upper case, otherwise the word unchanged.
    """
    word = text.group(0)

    # Any character that is not upper case leaves the word untouched.
    for letter in word:
        if not letter.isupper():
            return word

    return "".join(upper_to_kor[letter] for letter in word)

def normalize_quote(text):
    """Re-wrap every quoted span of `text` in plain single quotes.

    Each span matched by `quote_checker` loses its first and last character,
    is split into sentences with NLTK, and every sentence is wrapped in
    ASCII single quotes; the pieces are joined with a single space.
    """
    def requote(match):
        # Imported lazily: NLTK doesn't get along with multiprocessing.
        from nltk import sent_tokenize

        inner = match.group()[1:-1]
        return " ".join("'{}'".format(sentence) for sentence in sent_tokenize(inner))

    return re.sub(quote_checker, requote, text)

# A signed number.  Group 1 is the sign plus integer part (commas allowed
# only BETWEEN digit groups); an optional ".digits" fraction follows.
# The previous pattern `\d[\d,]*[\.]?\d*` also swallowed a trailing comma or
# period ("2017," / "12."), which removed punctuation from the sentence and
# made "12." read as a decimal with an empty fraction.
number_checker = r"([+-]?\d+(?:,\d+)*)(?:\.\d+)?"
# Counters that take native-Korean numerals; appended to number_checker.
count_checker = r"(시|명|가지|살|마리|포기|송이|수|톨|통|점|개|벌|척|채|다발|그루|자루|줄|켤레|그릇|잔|마디|상자|사람|곡|병|판)"

def normalize_number(text):
    """Read unit symbols and numbers in `text` aloud in Korean.

    Unit symbols are expanded first (multi-letter units before the bare
    'm').  Numbers directly followed by a counter word are then read with
    native-Korean numerals, and every remaining number with Sino-Korean
    numerals.
    """
    for unit_table in (unit_to_kor1, unit_to_kor2):
        text = normalize_with_dictionary(text, unit_table)

    def read_counted(match):
        return number_to_korean(match, True)

    def read_plain(match):
        return number_to_korean(match, False)

    # Counted numbers must be handled first; the plain pass would otherwise
    # consume their digits.
    counted = re.sub(number_checker + count_checker, read_counted, text)
    return re.sub(number_checker, read_plain, counted)

# Sino-Korean digits 1-9 (index 0 is empty: a zero digit is not read).
num_to_kor1 = [""] + list("일이삼사오육칠팔구")
# Names of the 10^4 groups: (none), 만, 억, 조, 경, 해.
num_to_kor2 = [""] + list("만억조경해")
# Names of the places inside one 4-digit group: (none), 십, 백, 천.
num_to_kor3 = [""] + list("십백천")

# Native-Korean digits 1-9 in the form used directly before a counter
# (한/두/세/네 rather than the standalone 하나/둘/셋/넷).
count_to_kor1 = [""] + ["한","두","세","네","다섯","여섯","일곱","여덟","아홉"]

# Native-Korean names of the tens, keyed by the "<digit>십" string that the
# digit loop in number_to_korean() produces.
count_tenth_dict = {
        "십": "열",
        "두십": "스물",
        "세십": "서른",
        "네십": "마흔",
        "다섯십": "쉰",
        "여섯십": "예순",
        "일곱십": "일흔",
        "여덟십": "여든",
        "아홉십": "아흔",
}


def number_to_korean(num_str, is_count=False):
    """Spell out one matched number in Korean.

    Args:
        num_str: a regex match object (despite the name).  With
            `is_count=True`, group 1 is the number and group 2 the counter
            word; otherwise the whole match is the number.
        is_count: read the number with native-Korean numerals, as used
            before counters, instead of Sino-Korean ones.

    Returns:
        The Korean reading, prefixed with "플러스 " / "마이너스 " for a signed
        number and followed by the counter word (or by any trailing ',' / '.'
        that the match captured after the digits).

    Raises:
        Exception: if the number has more than one decimal point, or if
            `is_count` is set and the number has a fractional part.
    """
    if is_count:
        num_str, unit_str = num_str.group(1), num_str.group(2)
    else:
        matched = num_str.group()
        # A match may end in ',' or '.' that belongs to the sentence rather
        # than the number; hand it back untouched instead of dropping it or
        # reading it as an empty fraction.
        num_str = matched.rstrip(".,")
        unit_str = matched[len(num_str):]

    num_str = num_str.replace(',', '')

    check_float = num_str.split('.')
    if len(check_float) == 2:
        digit_str, float_str = check_float
    elif len(check_float) >= 3:
        raise Exception(" [!] Wrong number format")
    else:
        digit_str, float_str = check_float[0], None

    if is_count and float_str is not None:
        raise Exception(" [!] `is_count` and float number does not fit each other")

    # int() accepts a sign and leading zeros ("+5", "007"); the earlier
    # ast.literal_eval() call raised SyntaxError on leading zeros and the
    # digit loop raised ValueError on a '+' sign.
    digit = abs(int(digit_str))

    # A value of zero is read as "영"; keep the counter / trailing text.
    if digit == 0 and not (float_str or "").strip("0"):
        return "영" + unit_str

    digit_str = str(digit)
    size = len(digit_str)
    kor = ""
    tmp = []

    for i, v in enumerate(digit_str, start=1):
        v = int(v)
        place = size - i  # power of ten of this digit

        if v != 0:
            tmp.append(count_to_kor1[v] if is_count else num_to_kor1[v])
            tmp.append(num_to_kor3[place % 4])

        # End of a 4-digit group: emit it followed by the group name.
        if place % 4 == 0 and tmp:
            kor += "".join(tmp)
            tmp = []
            kor += num_to_kor2[place // 4]

    if not kor:
        # Integer part is 0 but the fraction is not (e.g. "0.5").
        kor = "영"

    if is_count:
        # A leading "one" is not spoken before a larger unit.
        if kor.startswith("한") and len(kor) > 1:
            kor = kor[1:]

        if any(word in kor for word in count_tenth_dict):
            kor = re.sub(
                    '|'.join(count_tenth_dict.keys()),
                    lambda x: count_tenth_dict[x.group()], kor)

    if not is_count and kor.startswith("일") and len(kor) > 1:
        kor = kor[1:]

    if float_str is not None:
        # Fraction digits are read one by one after "쩜".
        kor += "쩜 "
        kor += re.sub(r'\d', lambda x: num_to_kor[x.group()], float_str)

    if num_str.startswith("+"):
        kor = "플러스 " + kor
    elif num_str.startswith("-"):
        kor = "마이너스 " + kor

    return kor + unit_str

if __name__ == "__main__":
    # Manual smoke test: print each sample next to its normalized form.
    def test_normalize(text):
        print(text)
        print(normalize(text))
        print("="*30)

    samples = [
        "JTBC는 JTBCs를 DY는 A가 Absolute",
        "오늘(13일) 101마리 강아지가",
        '"저돌"(猪突) 입니다.',
        '비대위원장이 지난 1월 이런 말을 했습니다. “난 그냥 산돼지처럼 돌파하는 스타일이다”',
        "지금은 -12.35%였고 종류는 5가지와 19가지, 그리고 55가지였다",
        "JTBC는 TH와 K 양이 2017년 9월 12일 오후 12시에 24살이 된다",
    ]
    for sample in samples:
        test_normalize(sample)
initial commit 2017-10-15 16:00:17 +02:00			`# Code based on`

			`import re`
			`import os`
			`import ast`
			`import json`
			`from jamo import hangul_to_jamo, h2j, j2h`

			`from .ko_dictionary import english_dictionary, etc_dictionary`

			`PAD = '_'`
			`EOS = '~'`
			`PUNC = '!\'(),-.:;?'`
			`SPACE = ' '`

			`JAMO_LEADS = "".join([chr(_) for _ in range(0x1100, 0x1113)])`
			`JAMO_VOWELS = "".join([chr(_) for _ in range(0x1161, 0x1176)])`
			`JAMO_TAILS = "".join([chr(_) for _ in range(0x11A8, 0x11C3)])`

			`VALID_CHARS = JAMO_LEADS + JAMO_VOWELS + JAMO_TAILS + PUNC + SPACE`
			`ALL_SYMBOLS = PAD + EOS + VALID_CHARS`

			`char_to_id = {c: i for i, c in enumerate(ALL_SYMBOLS)}`
			`id_to_char = {i: c for i, c in enumerate(ALL_SYMBOLS)}`

			quote_checker = """([`"'＂“‘])(.+?)([`"'＂”’])"""

			`def is_lead(char):`
			`return char in JAMO_LEADS`

			`def is_vowel(char):`
			`return char in JAMO_VOWELS`

			`def is_tail(char):`
			`return char in JAMO_TAILS`

			`def get_mode(char):`
			`if is_lead(char):`
			`return 0`
			`elif is_vowel(char):`
			`return 1`
			`elif is_tail(char):`
			`return 2`
			`else:`
			`return -1`

			`def _get_text_from_candidates(candidates):`
			`if len(candidates) == 0:`
			`return ""`
			`elif len(candidates) == 1:`
			`return _jamo_char_to_hcj(candidates[0])`
			`else:`
			`return j2h(**dict(zip(["lead", "vowel", "tail"], candidates)))`

			`def jamo_to_korean(text):`
			`text = h2j(text)`

			`idx = 0`
			`new_text = ""`
			`candidates = []`

			`while True:`
			`if idx >= len(text):`
			`new_text += _get_text_from_candidates(candidates)`
			`break`

			`char = text[idx]`
			`mode = get_mode(char)`

			`if mode == 0:`
			`new_text += _get_text_from_candidates(candidates)`
			`candidates = [char]`
			`elif mode == -1:`
			`new_text += _get_text_from_candidates(candidates)`
			`new_text += char`
			`candidates = []`
			`else:`
			`candidates.append(char)`

			`idx += 1`
			`return new_text`

			`num_to_kor = {`
			`'0': '영',`
			`'1': '일',`
			`'2': '이',`
			`'3': '삼',`
			`'4': '사',`
			`'5': '오',`
			`'6': '육',`
			`'7': '칠',`
			`'8': '팔',`
			`'9': '구',`
			`}`

			`unit_to_kor1 = {`
			`'%': '퍼센트',`
			`'cm': '센치미터',`
			`'mm': '밀리미터',`
			`'km': '킬로미터',`
			`'kg': '킬로그람',`
			`}`
			`unit_to_kor2 = {`
			`'m': '미터',`
			`}`

			`upper_to_kor = {`
			`'A': '에이',`
			`'B': '비',`
			`'C': '씨',`
			`'D': '디',`
			`'E': '이',`
			`'F': '에프',`
			`'G': '지',`
			`'H': '에이치',`
			`'I': '아이',`
			`'J': '제이',`
			`'K': '케이',`
			`'L': '엘',`
			`'M': '엠',`
			`'N': '엔',`
			`'O': '오',`
			`'P': '피',`
			`'Q': '큐',`
			`'R': '알',`
			`'S': '에스',`
			`'T': '티',`
			`'U': '유',`
			`'V': '브이',`
			`'W': '더블유',`
			`'X': '엑스',`
			`'Y': '와이',`
			`'Z': '지',`
			`}`

			`def compare_sentence_with_jamo(text1, text2):`
			`return h2j(text1) != h2j(text)`

			`def tokenize(text, as_id=False):`
			`text = normalize(text)`
			`tokens = list(hangul_to_jamo(text))`

			`if as_id:`
			`return [char_to_id[token] for token in tokens] + [char_to_id[EOS]]`
			`else:`
			`return [token for token in tokens] + [EOS]`

			`def tokenizer_fn(iterator):`
			`return (token for x in iterator for token in tokenize(x, as_id=False))`

			`def normalize(text):`
			`text = text.strip()`

			`text = re.sub('\(\d+일\)', '', text)`
			`text = re.sub('\([⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+\)', '', text)`

			`text = normalize_with_dictionary(text, etc_dictionary)`
			`text = normalize_english(text)`
			`text = re.sub('[a-zA-Z]+', normalize_upper, text)`

			`text = normalize_quote(text)`
			`text = normalize_number(text)`

			`return text`

			`def normalize_with_dictionary(text, dic):`
			`if any(key in text for key in dic.keys()):`
			`pattern = re.compile('\|'.join(re.escape(key) for key in dic.keys()))`
			`return pattern.sub(lambda x: dic[x.group()], text)`
			`else:`
			`return text`

			`def normalize_english(text):`
			`def fn(m):`
			`word = m.group()`
			`if word in english_dictionary:`
			`return english_dictionary.get(word)`
			`else:`
			`return word`

			`text = re.sub("([A-Za-z]+)", fn, text)`
			`return text`

			`def normalize_upper(text):`
			`text = text.group(0)`

			`if all([char.isupper() for char in text]):`
			`return "".join(upper_to_kor[char] for char in text)`
			`else:`
			`return text`

			`def normalize_quote(text):`
			`def fn(found_text):`
			`from nltk import sent_tokenize # NLTK doesn't along with multiprocessing`

			`found_text = found_text.group()`
			`unquoted_text = found_text[1:-1]`

			`sentences = sent_tokenize(unquoted_text)`
			`return " ".join(["'{}'".format(sent) for sent in sentences])`

			`return re.sub(quote_checker, fn, text)`

			`number_checker = "([+-]?\d[\d,])[\.]?\d"`
			`count_checker = "(시\|명\|가지\|살\|마리\|포기\|송이\|수\|톨\|통\|점\|개\|벌\|척\|채\|다발\|그루\|자루\|줄\|켤레\|그릇\|잔\|마디\|상자\|사람\|곡\|병\|판)"`

			`def normalize_number(text):`
			`text = normalize_with_dictionary(text, unit_to_kor1)`
			`text = normalize_with_dictionary(text, unit_to_kor2)`
			`text = re.sub(number_checker + count_checker,`
			`lambda x: number_to_korean(x, True), text)`
			`text = re.sub(number_checker,`
			`lambda x: number_to_korean(x, False), text)`
			`return text`

			`num_to_kor1 = [""] + list("일이삼사오육칠팔구")`
			`num_to_kor2 = [""] + list("만억조경해")`
			`num_to_kor3 = [""] + list("십백천")`

			`#count_to_kor1 = [""] + ["하나","둘","셋","넷","다섯","여섯","일곱","여덟","아홉"]`
			`count_to_kor1 = [""] + ["한","두","세","네","다섯","여섯","일곱","여덟","아홉"]`

			`count_tenth_dict = {`
			`"십": "열",`
			`"두십": "스물",`
			`"세십": "서른",`
			`"네십": "마흔",`
			`"다섯십": "쉰",`
			`"여섯십": "예순",`
			`"일곱십": "일흔",`
			`"여덟십": "여든",`
			`"아홉십": "아흔",`
			`}`



			`def number_to_korean(num_str, is_count=False):`
			`if is_count:`
			`num_str, unit_str = num_str.group(1), num_str.group(2)`
			`else:`
			`num_str, unit_str = num_str.group(), ""`

			`num_str = num_str.replace(',', '')`
			`num = ast.literal_eval(num_str)`

			`if num == 0:`
			`return "영"`

			`check_float = num_str.split('.')`
			`if len(check_float) == 2:`
			`digit_str, float_str = check_float`
			`elif len(check_float) >= 3:`
			`raise Exception(" [!] Wrong number format")`
			`else:`
			`digit_str, float_str = check_float[0], None`

			`if is_count and float_str is not None:`
			raise Exception(" [!] `is_count` and float number does not fit each other")

			`digit = int(digit_str)`

			`if digit_str.startswith("-"):`
			`digit, digit_str = abs(digit), str(abs(digit))`

			`kor = ""`
			`size = len(str(digit))`
			`tmp = []`

			`for i, v in enumerate(digit_str, start=1):`
			`v = int(v)`

			`if v != 0:`
			`if is_count:`
			`tmp += count_to_kor1[v]`
			`else:`
			`tmp += num_to_kor1[v]`

			`tmp += num_to_kor3[(size - i) % 4]`

			`if (size - i) % 4 == 0 and len(tmp) != 0:`
			`kor += "".join(tmp)`
			`tmp = []`
			`kor += num_to_kor2[int((size - i) / 4)]`

			`if is_count:`
			`if kor.startswith("한") and len(kor) > 1:`
			`kor = kor[1:]`

			`if any(word in kor for word in count_tenth_dict):`
			`kor = re.sub(`
			`'\|'.join(count_tenth_dict.keys()),`
			`lambda x: count_tenth_dict[x.group()], kor)`

			`if not is_count and kor.startswith("일") and len(kor) > 1:`
			`kor = kor[1:]`

			`if float_str is not None:`
			`kor += "쩜 "`
			`kor += re.sub('\d', lambda x: num_to_kor[x.group()], float_str)`

			`if num_str.startswith("+"):`
			`kor = "플러스 " + kor`
			`elif num_str.startswith("-"):`
			`kor = "마이너스 " + kor`

			`return kor + unit_str`

			`if __name__ == "__main__":`
			`def test_normalize(text):`
			`print(text)`
			`print(normalize(text))`
			`print("="*30)`

			`test_normalize("JTBC는 JTBCs를 DY는 A가 Absolute")`
			`test_normalize("오늘(13일) 101마리 강아지가")`
			`test_normalize('"저돌"(猪突) 입니다.')`
			`test_normalize('비대위원장이 지난 1월 이런 말을 했습니다. “난 그냥 산돼지처럼 돌파하는 스타일이다”')`
			`test_normalize("지금은 -12.35%였고 종류는 5가지와 19가지, 그리고 55가지였다")`
			`test_normalize("JTBC는 TH와 K 양이 2017년 9월 12일 오후 12시에 24살이 된다")`