# Code based on
"""Korean text normalization and jamo tokenization helpers.

The module turns raw Korean text (possibly mixed with digits, units, quotes
and upper-case Latin abbreviations) into a spoken-form string and then into
a sequence of jamo symbols (or their integer ids).
"""

import re
import os
import ast
import json

from jamo import hangul_to_jamo, h2j, j2h, j2hcj

from .ko_dictionary import english_dictionary, etc_dictionary

PAD = '_'
EOS = '~'
PUNC = '!\'(),-.:;?'
SPACE = ' '

JAMO_LEADS = "".join([chr(_) for _ in range(0x1100, 0x1113)])
JAMO_VOWELS = "".join([chr(_) for _ in range(0x1161, 0x1176)])
JAMO_TAILS = "".join([chr(_) for _ in range(0x11A8, 0x11C3)])

VALID_CHARS = JAMO_LEADS + JAMO_VOWELS + JAMO_TAILS + PUNC + SPACE
ALL_SYMBOLS = PAD + EOS + VALID_CHARS

char_to_id = {c: i for i, c in enumerate(ALL_SYMBOLS)}
id_to_char = {i: c for i, c in enumerate(ALL_SYMBOLS)}

# Opening quote, lazily-matched quoted text, closing quote.
quote_checker = """([`"'"“‘])(.+?)([`"'"”’])"""


def is_lead(char):
    """Return True if *char* is one of the leading-consonant jamo."""
    return char in JAMO_LEADS


def is_vowel(char):
    """Return True if *char* is one of the vowel jamo."""
    return char in JAMO_VOWELS


def is_tail(char):
    """Return True if *char* is one of the trailing-consonant jamo."""
    return char in JAMO_TAILS


def get_mode(char):
    """Classify *char*: 0 = lead, 1 = vowel, 2 = tail, -1 = anything else."""
    if is_lead(char):
        return 0
    elif is_vowel(char):
        return 1
    elif is_tail(char):
        return 2
    else:
        return -1


def _get_text_from_candidates(candidates):
    """Compose the collected jamo of one syllable into text.

    An empty list yields "", a single jamo is converted to its Hangul
    Compatibility Jamo form, and two or three jamo are composed into one
    syllable with ``j2h`` (lead, vowel, optional tail).
    """
    if not candidates:
        return ""
    if len(candidates) == 1:
        # Public jamo API; the original referenced a private helper that was
        # never imported, which raised NameError on this path.
        return j2hcj(candidates[0])
    return j2h(**dict(zip(["lead", "vowel", "tail"], candidates)))


def jamo_to_korean(text):
    """Recompose a jamo string into Hangul syllables.

    Non-jamo characters are passed through unchanged and terminate the
    syllable currently being collected; a lead jamo starts a new syllable.
    """
    text = h2j(text)

    pieces = []
    candidates = []
    for char in text:
        mode = get_mode(char)
        if mode == 0:
            pieces.append(_get_text_from_candidates(candidates))
            candidates = [char]
        elif mode == -1:
            pieces.append(_get_text_from_candidates(candidates))
            pieces.append(char)
            candidates = []
        else:
            candidates.append(char)

    # Flush whatever syllable was still being collected at end of input.
    pieces.append(_get_text_from_candidates(candidates))
    return "".join(pieces)


num_to_kor = {
    '0': '영',
    '1': '일',
    '2': '이',
    '3': '삼',
    '4': '사',
    '5': '오',
    '6': '육',
    '7': '칠',
    '8': '팔',
    '9': '구',
}

# Applied before unit_to_kor2 so that 'cm', 'mm', 'km' are consumed before
# the bare 'm' replacement runs.
unit_to_kor1 = {
    '%': '퍼센트',
    'cm': '센치미터',
    'mm': '밀리미터',
    'km': '킬로미터',
    'kg': '킬로그람',
}
unit_to_kor2 = {
    'm': '미터',
}

upper_to_kor = {
    'A': '에이',
    'B': '비',
    'C': '씨',
    'D': '디',
    'E': '이',
    'F': '에프',
    'G': '지',
    'H': '에이치',
    'I': '아이',
    'J': '제이',
    'K': '케이',
    'L': '엘',
    'M': '엠',
    'N': '엔',
    'O': '오',
    'P': '피',
    'Q': '큐',
    'R': '알',
    'S': '에스',
    'T': '티',
    'U': '유',
    'V': '브이',
    'W': '더블유',
    'X': '엑스',
    'Y': '와이',
    'Z': '지',
}


def compare_sentence_with_jamo(text1, text2):
    """Return True when the jamo decompositions of the two texts differ."""
    return h2j(text1) != h2j(text2)


def tokenize(text, as_id=False):
    """Normalize *text* and split it into jamo tokens followed by EOS.

    With ``as_id=True`` the tokens are mapped through ``char_to_id``; a
    token that is not in ``ALL_SYMBOLS`` raises ``KeyError``.
    """
    text = normalize(text)
    tokens = list(hangul_to_jamo(text))

    if as_id:
        return [char_to_id[token] for token in tokens] + [char_to_id[EOS]]
    return tokens + [EOS]


def tokenizer_fn(iterator):
    """Lazily yield the jamo tokens of every string in *iterator*."""
    return (token for x in iterator for token in tokenize(x, as_id=False))


def normalize(text):
    """Convert raw text into its spoken Korean form.

    Steps, in order: strip; drop "(N일)" date notes and parenthesised CJK
    ideographs; apply ``etc_dictionary``; apply ``english_dictionary``; spell
    out all-upper-case Latin words; normalize quotes; read out numbers.
    """
    text = text.strip()

    text = re.sub(r'\(\d+일\)', '', text)
    text = re.sub(r'\([⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+\)', '', text)

    text = normalize_with_dictionary(text, etc_dictionary)
    text = normalize_english(text)
    text = re.sub('[a-zA-Z]+', normalize_upper, text)

    text = normalize_quote(text)
    text = normalize_number(text)

    return text


def normalize_with_dictionary(text, dic):
    """Replace every occurrence of a key of *dic* in *text* with its value."""
    # The membership pre-check also protects against an empty dictionary,
    # whose joined pattern would be the empty regex.
    if any(key in text for key in dic.keys()):
        pattern = re.compile('|'.join(re.escape(key) for key in dic.keys()))
        return pattern.sub(lambda x: dic[x.group()], text)
    return text


def normalize_english(text):
    """Replace Latin-letter words found in ``english_dictionary``."""
    def fn(m):
        word = m.group()
        return english_dictionary.get(word, word)

    return re.sub("([A-Za-z]+)", fn, text)


def normalize_upper(text):
    """Regex callback: spell out an all-upper-case word letter by letter.

    *text* is a match object; mixed- or lower-case words are returned as is.
    """
    text = text.group(0)

    if all(char.isupper() for char in text):
        return "".join(upper_to_kor[char] for char in text)
    return text


def normalize_quote(text):
    """Re-quote each sentence inside a quoted span with plain single quotes."""
    def fn(found_text):
        # Imported lazily on purpose: NLTK does not get along with
        # multiprocessing when imported at module load time.
        from nltk import sent_tokenize

        found_text = found_text.group()
        unquoted_text = found_text[1:-1]

        sentences = sent_tokenize(unquoted_text)
        return " ".join(["'{}'".format(sent) for sent in sentences])

    return re.sub(quote_checker, fn, text)


number_checker = r"([+-]?\d[\d,]*)[\.]?\d*"
count_checker = "(시|명|가지|살|마리|포기|송이|수|톨|통|점|개|벌|척|채|다발|그루|자루|줄|켤레|그릇|잔|마디|상자|사람|곡|병|판)"


def normalize_number(text):
    """Spell out units and numbers in *text*.

    Numbers directly followed by a counter word from ``count_checker`` are
    read with native Korean numerals; all remaining numbers are read with
    Sino-Korean numerals.
    """
    text = normalize_with_dictionary(text, unit_to_kor1)
    text = normalize_with_dictionary(text, unit_to_kor2)
    text = re.sub(number_checker + count_checker,
                  lambda x: number_to_korean(x, True), text)
    text = re.sub(number_checker,
                  lambda x: number_to_korean(x, False), text)
    return text


num_to_kor1 = [""] + list("일이삼사오육칠팔구")
num_to_kor2 = [""] + list("만억조경해")
num_to_kor3 = [""] + list("십백천")

# Counters take the prenominal native numerals (한, 두, 세, 네, ...), not the
# standalone forms (하나, 둘, 셋, 넷, ...).
count_to_kor1 = [""] + ["한", "두", "세", "네", "다섯", "여섯", "일곱", "여덟", "아홉"]

count_tenth_dict = {
    "십": "열",
    "두십": "스물",
    "세십": "서른",
    "네십": "마흔",
    "다섯십": "쉰",
    "여섯십": "예순",
    "일곱십": "일흔",
    "여덟십": "여든",
    "아홉십": "아흔",
}

_count_tenth_pattern = re.compile('|'.join(count_tenth_dict.keys()))


def number_to_korean(num_str, is_count=False):
    """Regex callback: read a matched number out in Korean.

    Args:
        num_str: match object produced by ``number_checker`` (optionally
            followed by ``count_checker``).
        is_count: when True, group 1 is the number and group 2 the counter
            word; native numerals are used and the counter is appended.

    Returns:
        The spoken form, with "플러스 " / "마이너스 " for an explicit sign and
        "쩜 " before the fractional digits.

    Raises:
        ValueError: on a malformed number, or a fractional count.
        IndexError: when the integer part has more digits than the myriad
            units in ``num_to_kor2`` cover.
    """
    if is_count:
        raw, unit_str = num_str.group(1), num_str.group(2)
        body, trailing = raw.replace(',', ''), ""
    else:
        raw, unit_str = num_str.group(), ""
        # The pattern may swallow a sentence-final '.' or a list ','; give
        # that punctuation back instead of reading it as part of the number.
        body = raw.rstrip('.,')
        trailing = raw[len(body):]
        body = body.replace(',', '')

    sign = ""
    if body[0] in "+-":
        sign, body = body[0], body[1:]

    # Zero in any spelling ("0", "00", "0.0") is read as a bare "영".
    if not any(ch in "123456789" for ch in body):
        return "영" + unit_str + trailing

    check_float = body.split('.')
    if len(check_float) == 2:
        digit_str, float_str = check_float
    elif len(check_float) >= 3:
        raise ValueError(" [!] Wrong number format")
    else:
        digit_str, float_str = check_float[0], None

    if is_count and float_str is not None:
        raise ValueError(" [!] `is_count` and float number does not fit each other")

    # Round-trip through int to drop leading zeros, which would otherwise
    # shift every place value computed from the string length.
    digit = int(digit_str)
    digit_str = str(digit)
    size = len(digit_str)

    digit_words = count_to_kor1 if is_count else num_to_kor1

    kor = ""
    group = ""
    for i, ch in enumerate(digit_str, start=1):
        v = int(ch)
        place = size - i
        if v != 0:
            small_unit = num_to_kor3[place % 4]
            if v == 1 and small_unit:
                # 1 is silent before 십/백/천 (2017 -> 이천십칠).
                group += small_unit
            else:
                group += digit_words[v] + small_unit

        # Close a four-digit group with its myriad unit (만, 억, ...).
        if place % 4 == 0 and group:
            kor += group + num_to_kor2[place // 4]
            group = ""

    if digit == 0:
        # Only reachable with a non-zero fraction, e.g. "0.5".
        kor = "영"

    if is_count:
        if kor.startswith("한") and len(kor) > 1:
            kor = kor[1:]

        if any(word in kor for word in count_tenth_dict):
            kor = _count_tenth_pattern.sub(
                lambda x: count_tenth_dict[x.group()], kor)

    if not is_count and kor.startswith("일") and len(kor) > 1:
        kor = kor[1:]

    if float_str is not None:
        kor += "쩜 "
        kor += re.sub(r'\d', lambda x: num_to_kor[x.group()], float_str)

    if sign == "+":
        kor = "플러스 " + kor
    elif sign == "-":
        kor = "마이너스 " + kor

    return kor + unit_str + trailing


if __name__ == "__main__":
    def test_normalize(text):
        print(text)
        print(normalize(text))
        print("="*30)

    test_normalize("JTBC는 JTBCs를 DY는 A가 Absolute")
    test_normalize("오늘(13일) 101마리 강아지가")
    test_normalize('"저돌"(猪突) 입니다.')
    test_normalize('비대위원장이 지난 1월 이런 말을 했습니다. “난 그냥 산돼지처럼 돌파하는 스타일이다”')
    test_normalize("지금은 -12.35%였고 종류는 5가지와 19가지, 그리고 55가지였다")
    test_normalize("JTBC는 TH와 K 양이 2017년 9월 12일 오후 12시에 24살이 된다")