initial commit

Commit 0a6f6db0b1
56 changed files with 49395 additions and 0 deletions

.gitignore (vendored, new file, 132 lines)
@@ -0,0 +1,132 @@
# Text
*.png
*.txt
*.json
*.csv

# Data
logs
*.npy
*.npz
*.tar
*.tar.gz

# Media
*.mp4
*.mp3
*.flac
*.wav
*.ts

.DS_Store

# Created by https://www.gitignore.io/api/python,vim

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
.venv/
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject


### Vim ###
# swap
[._]*.s[a-v][a-z]
[._]*.sw[a-p]
[._]s[a-v][a-z]
[._]sw[a-p]
# session
Session.vim
# temporary
.netrwhist
*~
# auto-generated tag files
tags

# End of https://www.gitignore.io/api/python,vim

DISCLAIMER (new file, 3 lines)
@@ -0,0 +1,3 @@
This is not an official [DEVSISTERS](http://devsisters.com/) product, and DEVSISTERS is not responsible for misuse or for any damage that you may cause. You agree that you use this software at your own risk.

이것은 [데브시스터즈](http://devsisters.com/)의 공식적인 제품이 아닙니다. [데브시스터즈](http://devsisters.com/)는 이 코드를 잘못 사용했을 시 발생한 문제나 이슈에 대한 책임을 지지 않으며 이 소프트웨어의 사용은 사용자 자신에게 전적으로 책임이 있습니다.

LICENSE (new file, 40 lines)
@@ -0,0 +1,40 @@
Copyright (c) 2017 Devsisters

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.


Copyright (c) 2017 Keith Ito

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

README.md (new file, 168 lines)
@@ -0,0 +1,168 @@
# Multi-Speaker Tacotron in TensorFlow

[[한국어 가이드](./README_ko.md)]

TensorFlow implementation of:

- [Deep Voice 2: Multi-Speaker Neural Text-to-Speech](https://arxiv.org/abs/1705.08947)
- [Listening while Speaking: Speech Chain by Deep Learning](https://arxiv.org/abs/1707.04879)
- [Tacotron: Towards End-to-End Speech Synthesis](https://arxiv.org/abs/1703.10135)

Audio samples (in Korean) can be found [here](http://carpedm20.github.io/tacotron/en.html).

![](./assets/model.png)


## Prerequisites

- Python 3.6+
- [Tensorflow 1.3](https://www.tensorflow.org/install/)


## Usage

### 1. Install prerequisites

After preparing [Tensorflow](https://www.tensorflow.org/install/), install the prerequisites with:

    pip3 install -r requirements.txt

If you want to synthesize Korean speech directly, follow [2-3. Download pre-trained models](#2-3-download-pre-trained-models).


### 2-1. Generate custom datasets

The `datasets` directory should look like:

    datasets
    ├── jtbc
    │   ├── alignment.json
    │   └── audio
    │       ├── 1.mp3
    │       ├── 2.mp3
    │       ├── 3.mp3
    │       └── ...
    └── YOUR_DATASET
        ├── alignment.json
        └── audio
            ├── 1.mp3
            ├── 2.mp3
            ├── 3.mp3
            └── ...

and `YOUR_DATASET/alignment.json` should look like:

    {
        "./datasets/YOUR_DATASET/audio/001.mp3": "My name is Taehoon Kim.",
        "./datasets/YOUR_DATASET/audio/002.mp3": "The buses aren't the problem.",
        "./datasets/YOUR_DATASET/audio/003.mp3": "They have discovered a new particle."
    }
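
If you need to build `alignment.json` programmatically, a minimal sketch is shown below. The tab-separated transcript file and its location are assumptions made for illustration; only the JSON format above is defined by this project.

    # build_alignment.py -- hypothetical helper, not part of this repository
    import json
    import os

    # Assumed input: transcript.tsv with lines like "001.mp3<TAB>My name is Taehoon Kim."
    alignment = {}
    with open("./datasets/YOUR_DATASET/transcript.tsv", encoding="utf-8") as f:
        for line in f:
            filename, text = line.rstrip("\n").split("\t", 1)
            audio_path = os.path.join("./datasets/YOUR_DATASET/audio", filename)
            alignment[audio_path] = text

    # Write the mapping in the format expected by datasets.generate_data
    with open("./datasets/YOUR_DATASET/alignment.json", "w", encoding="utf-8") as f:
        json.dump(alignment, f, indent=2, ensure_ascii=False)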

After you have prepared the data as described, generate the preprocessed data with:

    python -m datasets.generate_data ./datasets/YOUR_DATASET/alignment.json


### 2-2. Generate Korean datasets

You can generate datasets for the following three public Korean figures:

1. [Sohn Suk-hee](https://en.wikipedia.org/wiki/Sohn_Suk-hee): anchor and president of JTBC
2. [Park Geun-hye](https://en.wikipedia.org/wiki/Park_Geun-hye): a former President of South Korea
3. [Moon Jae-in](https://en.wikipedia.org/wiki/Moon_Jae-in): the current President of South Korea

Each dataset can be generated with the following scripts:

    ./scripts/prepare_son.sh # Sohn Suk-hee
    ./scripts/prepare_park.sh # Park Geun-hye
    ./scripts/prepare_moon.sh # Moon Jae-in


Each script runs the commands below (explained here with the `son` dataset).

0. To automate the alignment between audio and text, set `GOOGLE_APPLICATION_CREDENTIALS` so that the [Google Speech Recognition API](https://cloud.google.com/speech/) can be used. To get credentials, read [this](https://developers.google.com/identity/protocols/application-default-credentials).

        export GOOGLE_APPLICATION_CREDENTIALS="YOUR-GOOGLE.CREDENTIALS.json"

1. Download the speech (or video) and text.

        python -m datasets.son.download

2. Split all audio files on silence.

        python -m audio.silence --audio_pattern "./datasets/son/audio/*.wav" --method=pydub

3. Using the [Google Speech Recognition API](https://cloud.google.com/speech/), predict a sentence for each segmented audio clip. (This is optional for `moon` and `park`, which already ship with `alignment.json`.)

        python -m recognition.google --audio_pattern "./datasets/son/audio/*.*.wav"

4. Compare the original text with the recognized text and save the `audio<->text` pairs to `./datasets/son/alignment.json`. (A rough illustration of this kind of text matching is shown after this list.)

        python -m recognition.alignment --recognition_path "./datasets/son/recognition.json" --score_threshold=0.5

5. Finally, generate the numpy files that will be used in training.

        python3 -m datasets.synthesizer_data ./datasets/son/alignment.json

Because the automatic pipeline is quite naive, the resulting dataset is noisy. However, with enough data (20+ hours when training from scratch, or 5+ hours when initializing from a pre-trained model), you can expect acceptable audio quality.
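
Step 4 above relies on scoring the similarity between the original script and the recognized text. A minimal sketch of that kind of scoring is shown below; it mirrors the `difflib.SequenceMatcher` usage in `audio/google_speech.py`, not necessarily the exact scoring used by `recognition.alignment`.

    # similarity_demo.py -- illustration only
    from difflib import SequenceMatcher

    def similarity(text_a, text_b):
        # Strip whitespace so spacing differences do not affect the score
        a = "".join(text_a.split())
        b = "".join(text_b.split())
        return SequenceMatcher(None, a, b).ratio()

    # Pairs scoring above --score_threshold would be kept as audio<->text pairs
    print(similarity("오누이는 서로 자기가 할 일을 정했다.", "서로 자기가 할 일을 정했다."))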


### 2-3. Download pre-trained models

You can download pre-trained models and generate audio with them. The available models are:

1. Single-speaker model for [Sohn Suk-hee](https://en.wikipedia.org/wiki/Sohn_Suk-hee).

        python3 download.py son

2. Single-speaker model for [Park Geun-hye](https://en.wikipedia.org/wiki/Park_Geun-hye).

        python3 download.py park

After you download a pre-trained model, you can generate voices as follows:

    python3 synthesizer.py --load_path logs/son-20171015 --text "이거 실화냐?"
    python3 synthesizer.py --load_path logs/park-20171015 --text "이거 실화냐?"

**WARNING: The two pre-trained models are made available for research purposes only.**


### 3. Train a model

To train a single-speaker model:

    python train.py --data_path=datasets/jtbc
    python train.py --data_path=datasets/park --initialize_path=PATH_TO_CHECKPOINT

To train a multi-speaker model:

    python train.py --data_path=datasets/jtbc,datasets/park

If you don't have a good enough dataset (10+ hours), it is better to pass `--initialize_path` so that a well-trained model is used for the initial parameters.


### 4. Synthesize audio

You can run a web demo with:

    python3 app.py --load_path logs/park-20171015 --num_speakers=1

or generate audio directly with:

    python3 synthesizer.py --load_path logs/park-20171015 --text "이거 실화냐?"
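
With the `app.py` demo running, audio can also be requested over HTTP from its `/generate` endpoint. A minimal sketch, assuming the default port 5000 and speaker id 0 (both depend on how you launched `app.py`):

    # request_sample.py -- assumes app.py is running locally
    import urllib.parse
    import urllib.request

    params = urllib.parse.urlencode({"text": "이거 실화냐?", "speaker_id": 0})
    with urllib.request.urlopen("http://localhost:5000/generate?" + params) as resp:
        with open("sample.wav", "wb") as f:
            f.write(resp.read())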


## Disclaimer

This is not an official [DEVSISTERS](http://devsisters.com/) product. DEVSISTERS is not responsible for misuse or for any damage that you may cause. You agree that you use this software at your own risk.


## References

- [Keith Ito](https://github.com/keithito)'s [tacotron](https://github.com/keithito/tacotron)
- [DEVIEW 2017 presentation](https://deview.kr/2017/schedule/182) (Korean)


## Author

Taehoon Kim / [@carpedm20](http://carpedm20.github.io/)

README_ko.md (new file, 169 lines)
@@ -0,0 +1,169 @@
# D.Voice: 오픈소스 딥러닝 음성 합성 엔진
|
||||
|
||||
[[English Guide](./README.md)]
|
||||
|
||||
D.Voice는 TensorFlow로 구현된 오픈소스 딥러닝 음성 합성 엔진입니다. 이 프로젝트는:
|
||||
|
||||
- [Deep Voice 2: Multi-Speaker Neural Text-to-Speech](https://arxiv.org/abs/1705.08947)
|
||||
- [Listening while Speaking: Speech Chain by Deep Learning](https://arxiv.org/abs/1707.04879)
|
||||
- [Tacotron: Towards End-to-End Speech Synthesis](https://arxiv.org/abs/1703.10135)
|
||||
|
||||
위 세 논문의 모델들의 구현을 포함하고 있습니다. 음성 데모는 [여기](http://carpedm20.github.io/tacotron/)서 들어보실 수 있습니다.
|
||||
|
||||

|
||||
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Python 3.6+
|
||||
- [Tensorflow 1.3](https://www.tensorflow.org/install/)
|
||||
|
||||
|
||||
## 사용 방법
|
||||
|
||||
### 1. 필수 라이브러리 설치
|
||||
|
||||
[Tensorflow 1.3](https://www.tensorflow.org/install/)를 설치한 후, 아래 명령어로 필수 라이브러리를 설치합니다.
|
||||
|
||||
pip3 install -r requirements.txt
|
||||
|
||||
바로 음성을 만들고 싶으면 [2-4. 미리 학습된 모델 다운받기](#2-4-미리-학습된-모델-다운받기)를 따라하시면 됩니다.
|
||||
|
||||
|
||||
### 2-1. 학습할 데이터 준비하기
|
||||
|
||||
`datasets` 디렉토리는 다음과 같이 구성되어야 합니다:
|
||||
|
||||
datasets
|
||||
├── son
|
||||
│ ├── alignment.json
|
||||
│ └── audio
|
||||
│ ├── 1.mp3
|
||||
│ ├── 2.mp3
|
||||
│ ├── 3.mp3
|
||||
│ └── ...
|
||||
└── 아무개
|
||||
├── alignment.json
|
||||
└── audio
|
||||
├── 1.mp3
|
||||
├── 2.mp3
|
||||
├── 3.mp3
|
||||
└── ...
|
||||
|
||||
그리고 `아무개/alignment.json`는 아래와 같은 포멧으로 `json` 형태로 준비해 주세요.
|
||||
|
||||
{
|
||||
"./datasets/아무개/audio/001.mp3": "존경하는 국민 여러분",
|
||||
"./datasets/아무개/audio/002.mp3": "국회의장과 국회의원 여러분",
|
||||
"./datasets/아무개/audio/003.mp3": "저는 오늘",
|
||||
}
|
||||
|
||||
`datasets`와 `아무개/alignment.json`가 준비되면, 아래 명령어로 학습 데이터를 만드시면 됩니다:
|
||||
|
||||
python3 -m datasets.synthesizer_data ./datasets/아무개/alignment.json
|
||||
|
||||
|
||||
### 2-2. {손석희, 문재인, 박근혜} 데이터 만들기
|
||||
|
||||
만약 음성 데이터가 없으시다면, 3명의 한국인 음성 데이터를 만드실 수 있습니다:
|
||||
|
||||
1. [손석희](https://ko.wikipedia.org/wiki/%EC%86%90%EC%84%9D%ED%9D%AC)
|
||||
2. [박근혜](https://ko.wikipedia.org/wiki/%EB%B0%95%EA%B7%BC%ED%98%9C)
|
||||
3. [문재인](https://ko.wikipedia.org/wiki/%EB%AC%B8%EC%9E%AC%EC%9D%B8)
|
||||
|
||||
각각의 데이터는 아래 스크립트로 만들 수 있으며,
|
||||
|
||||
./scripts/prepare_son.sh # 손석희
|
||||
./scripts/prepare_park.sh # 박근혜
|
||||
./scripts/prepare_moon.sh # 문재인
|
||||
|
||||
|
||||
각 스크립트는 아래와 같은 명령어를 실행합니다. (son 기준으로 설명합니다)
|
||||
|
||||
0. 자동으로 `음성<->텍스트` 페어를 만들기 위해 [구글 음성 인식 API](https://cloud.google.com/speech/)를 사용하며, `GOOGLE_APPLICATION_CREDENTIALS`를 준비해야 합니다. `GOOGLE_APPLICATION_CREDENTIALS`를 얻기 위해서는 [여기](https://developers.google.com/identity/protocols/application-default-credentials)를 참고해 주세요.
|
||||
|
||||
export GOOGLE_APPLICATION_CREDENTIALS="YOUR-GOOGLE.CREDENTIALS.json"
|
||||
|
||||
1. 음성(혹은 영상)과 텍스트 데이터를 다운로드 받습니다.
|
||||
|
||||
python -m datasets.son.download
|
||||
|
||||
2. 음성을 정적을 기준으로 분리합니다.
|
||||
|
||||
python -m audio.silence --audio_pattern "./datasets/son/audio/*.wav" --method=pydub
|
||||
|
||||
3. 작게 분리된 음성들을 [Google Speech Recognition API](https://cloud.google.com/speech/)를 사용해 대략적인 문장들을 예측합니다.
|
||||
|
||||
python -m recognition.google --audio_pattern "./datasets/son/audio/*.*.wav"
|
||||
|
||||
4. 기존의 텍스트와 음성 인식으로 예측된 텍스트를 비교해 `음성<->텍스트` 쌍 정보를 `./datasets/son/alignment.json`에 저장합니다. (`moon`과 `park` 데이터셋은 `alignment.json`이 이미 있기 때문에 이 과정은 생략하셔도 됩니다.)
|
||||
|
||||
python -m recognition.alignment --recognition_path "./datasets/son/recognition.json" --score_threshold=0.5
|
||||
|
||||
5. 마지막으로 학습에 사용될 numpy 파일들을 만듭니다.
|
||||
|
||||
python3 -m datasets.synthesizer_data ./datasets/son/alignment.json
|
||||
|
||||
|
||||
자동화 과정이 굉장히 간단하기 때문에, 데이터에 노이즈가 많이 존재합니다. 하지만 오디오와 텍스트가 충분히 많이 있다면 (처음부터 학습시 20시간 이상, 미리 학습된 모델에서 학습시 5+시간 이상) 적당한 퀄리티의 음성 합성을 기대할 수 있습니다.
|
||||
|
||||
|
||||
### 2-4. 미리 학습된 모델 다운받기
|
||||
|
||||
미리 학습된 모델들을 사용해 음성을 만들거나 모델을 학습시킬 수 있습니다. 아래 모델 중 하나를 다운로드 받으시고:
|
||||
|
||||
1. 단일 화자 모델 - [손석희](https://ko.wikipedia.org/wiki/%EC%86%90%EC%84%9D%ED%9D%AC)
|
||||
|
||||
python3 download.py son
|
||||
|
||||
2. 단일 화자 모델 - [박근혜](https://ko.wikipedia.org/wiki/%EB%B0%95%EA%B7%BC%ED%98%9C)
|
||||
|
||||
python3 download.py park
|
||||
|
||||
학습된 모델을 다운받으시고, 아래 명령어로 음성을 만들어 낼 수 있습니다:
|
||||
|
||||
python3 synthesizer.py --load_path logs/son-20171015 --text "이거 실화냐?"
|
||||
python3 synthesizer.py --load_path logs/park-20171015 --text "이거 실화냐?"
|
||||
|
||||
**주의: 학습된 모델을 연구 이외의 목적으로 사용하는 것을 금지합니다.**
|
||||
|
||||
|
||||
### 3. 모델 학습하기
|
||||
|
||||
단일 화자 모델을 학습하려면:
|
||||
|
||||
python3 train.py --data_path=datasets/son
|
||||
python3 train.py --data_path=datasets/park --initialize_path logs/son-20171015
|
||||
|
||||
다중 화자 모델을 학습하려면:
|
||||
|
||||
python3 train.py --data_path=datasets/son,datasets/park
|
||||
|
||||
학습 데이터가 좋지 않다면 `--initialize_path`로 이미 학습된 모델의 파라미터로 초기화 해서 학습하시는 것이 좋습니다.
|
||||
|
||||
|
||||
### 4. 음성 만들기
|
||||
|
||||
모델을 학습시킨 후 웹 데모를 통해 음성을 만들거나:
|
||||
|
||||
python app.py --load_path logs/park-20171015 --num_speakers=1
|
||||
|
||||
아래 명령어로 음성을 만들 수 있습니다:
|
||||
|
||||
python3 synthesizer.py --load_path logs/park-20171015 --text "이거 실화냐?"
|
||||
|
||||
|
||||
## Disclaimer
|
||||
|
||||
이것은 [데브시스터즈](http://devsisters.com/)의 공식적인 제품이 아닙니다. [데브시스터즈](http://devsisters.com/)는 이 코드를 잘못 사용했을 시 발생한 문제나 이슈에 대한 책임을 지지 않으며 이 소프트웨어의 사용은 사용자 자신에게 전적으로 책임이 있습니다.
|
||||
|
||||
|
||||
## References
|
||||
|
||||
- [Keith Ito](https://github.com/keithito)'s [tacotron](https://github.com/keithito/tacotron)
|
||||
- [DEVIEW 2017 발표 자료](https://www.slideshare.net/carpedm20/deview-2017-80824162)
|
||||
|
||||
|
||||
## Author
|
||||
|
||||
Taehoon Kim / [@carpedm20](http://carpedm20.github.io/)

app.py (new file, 133 lines)
@@ -0,0 +1,133 @@
#!flask/bin/python
|
||||
import os
|
||||
import hashlib
|
||||
import argparse

import pydub.silence                # needed by amplify() below
from pydub import AudioSegment      # needed by amplify() below
from flask_cors import CORS
|
||||
from flask import Flask, request, render_template, jsonify, \
|
||||
send_from_directory, make_response, send_file
|
||||
|
||||
from hparams import hparams
|
||||
from audio import load_audio
|
||||
from synthesizer import Synthesizer
|
||||
from utils import str2bool, prepare_dirs, makedirs, add_postfix
|
||||
|
||||
ROOT_PATH = "web"
|
||||
AUDIO_DIR = "audio"
|
||||
AUDIO_PATH = os.path.join(ROOT_PATH, AUDIO_DIR)
|
||||
|
||||
base_path = os.path.dirname(os.path.realpath(__file__))
|
||||
static_path = os.path.join(base_path, 'web/static')
|
||||
|
||||
global_config = None
|
||||
synthesizer = Synthesizer()
|
||||
app = Flask(__name__, root_path=ROOT_PATH, static_url_path='')
|
||||
CORS(app)
|
||||
|
||||
|
||||
def match_target_amplitude(sound, target_dBFS):
|
||||
change_in_dBFS = target_dBFS - sound.dBFS
|
||||
return sound.apply_gain(change_in_dBFS)
|
||||
|
||||
def amplify(path, keep_silence=300):
|
||||
sound = AudioSegment.from_file(path)
|
||||
|
||||
nonsilent_ranges = pydub.silence.detect_nonsilent(
|
||||
sound, silence_thresh=-50, min_silence_len=300)
|
||||
|
||||
new_sound = None
|
||||
for idx, (start_i, end_i) in enumerate(nonsilent_ranges):
|
||||
if idx == len(nonsilent_ranges) - 1:
|
||||
end_i = None
|
||||
|
||||
amplified_sound = \
|
||||
match_target_amplitude(sound[start_i:end_i], -20.0)
|
||||
|
||||
if idx == 0:
|
||||
new_sound = amplified_sound
|
||||
else:
|
||||
new_sound = new_sound.append(amplified_sound)
|
||||
|
||||
if idx < len(nonsilent_ranges) - 1:
|
||||
new_sound = new_sound.append(sound[end_i:nonsilent_ranges[idx+1][0]])
|
||||
|
||||
return new_sound.export("out.mp3", format="mp3")
|
||||
|
||||
def generate_audio_response(text, speaker_id):
|
||||
global global_config
|
||||
|
||||
model_name = os.path.basename(global_config.load_path)
|
||||
hashed_text = hashlib.md5(text.encode('utf-8')).hexdigest()
|
||||
|
||||
relative_dir_path = os.path.join(AUDIO_DIR, model_name)
|
||||
relative_audio_path = os.path.join(
|
||||
relative_dir_path, "{}.{}.wav".format(hashed_text, speaker_id))
|
||||
real_path = os.path.join(ROOT_PATH, relative_audio_path)
|
||||
makedirs(os.path.dirname(real_path))
|
||||
|
||||
if not os.path.exists(add_postfix(real_path, 0)):
|
||||
try:
|
||||
audio = synthesizer.synthesize(
|
||||
[text], paths=[real_path], speaker_ids=[speaker_id],
|
||||
attention_trim=True)[0]
|
||||
except:
|
||||
return jsonify(success=False), 400
|
||||
|
||||
return send_file(
|
||||
add_postfix(relative_audio_path, 0),
|
||||
mimetype="audio/wav",
|
||||
as_attachment=True,
|
||||
attachment_filename=hashed_text + ".wav")
|
||||
|
||||
response = make_response(audio)
|
||||
response.headers['Content-Type'] = 'audio/wav'
|
||||
response.headers['Content-Disposition'] = 'attachment; filename=sound.wav'
|
||||
return response
|
||||
|
||||
@app.route('/')
|
||||
def index():
|
||||
text = request.args.get('text') or "듣고 싶은 문장을 입력해 주세요."
|
||||
return render_template('index.html', text=text)
|
||||
|
||||
@app.route('/generate')
|
||||
def view_method():
|
||||
text = request.args.get('text')
|
||||
speaker_id = int(request.args.get('speaker_id'))
|
||||
|
||||
if text:
|
||||
return generate_audio_response(text, speaker_id)
|
||||
else:
|
||||
return {}
|
||||
|
||||
@app.route('/js/<path:path>')
|
||||
def send_js(path):
|
||||
return send_from_directory(
|
||||
os.path.join(static_path, 'js'), path)
|
||||
|
||||
@app.route('/css/<path:path>')
|
||||
def send_css(path):
|
||||
return send_from_directory(
|
||||
os.path.join(static_path, 'css'), path)
|
||||
|
||||
@app.route('/audio/<path:path>')
|
||||
def send_audio(path):
|
||||
return send_from_directory(
|
||||
os.path.join(static_path, 'audio'), path)
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--load_path', required=True)
|
||||
parser.add_argument('--checkpoint_step', default=None, type=int)
|
||||
parser.add_argument('--num_speakers', default=1, type=int)
|
||||
parser.add_argument('--port', default=5000, type=int)
|
||||
parser.add_argument('--debug', default=False, type=str2bool)
|
||||
config = parser.parse_args()
|
||||
|
||||
if os.path.exists(config.load_path):
|
||||
prepare_dirs(config, hparams)
|
||||
|
||||
global_config = config
|
||||
synthesizer.load(config.load_path, config.num_speakers, config.checkpoint_step)
|
||||
else:
|
||||
print(" [!] load_path not found: {}".format(config.load_path))
|
||||
|
||||
app.run(host='0.0.0.0', port=config.port, debug=config.debug)

assets/model.png (new binary file, 625 KiB)
Binary file not shown.

audio/__init__.py (new file, 168 lines)
@@ -0,0 +1,168 @@
# Code based on https://github.com/keithito/tacotron/blob/master/util/audio.py
|
||||
import math
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from scipy import signal
|
||||
from hparams import hparams
|
||||
|
||||
import librosa
|
||||
import librosa.filters
|
||||
|
||||
|
||||
def load_audio(path, pre_silence_length=0, post_silence_length=0):
|
||||
audio = librosa.core.load(path, sr=hparams.sample_rate)[0]
|
||||
if pre_silence_length > 0 or post_silence_length > 0:
|
||||
audio = np.concatenate([
|
||||
get_silence(pre_silence_length),
|
||||
audio,
|
||||
get_silence(post_silence_length),
|
||||
])
|
||||
return audio
|
||||
|
||||
def save_audio(audio, path, sample_rate=None):
|
||||
audio *= 32767 / max(0.01, np.max(np.abs(audio)))
|
||||
librosa.output.write_wav(path, audio.astype(np.int16),
|
||||
hparams.sample_rate if sample_rate is None else sample_rate)
|
||||
|
||||
print(" [*] Audio saved: {}".format(path))
|
||||
|
||||
|
||||
def resample_audio(audio, target_sample_rate):
|
||||
return librosa.core.resample(
|
||||
audio, hparams.sample_rate, target_sample_rate)
|
||||
|
||||
|
||||
def get_duration(audio):
|
||||
return librosa.core.get_duration(audio, sr=hparams.sample_rate)
|
||||
|
||||
|
||||
def frames_to_hours(n_frames):
|
||||
return sum((n_frame for n_frame in n_frames)) * \
|
||||
hparams.frame_shift_ms / (3600 * 1000)
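# Worked example of the conversion above (illustrative numbers only; the real
# hparams values are defined elsewhere in this repo): with frame_shift_ms=12.5,
# n_frames=[800, 1200] -> 2000 frames * 12.5 ms = 25000 ms = 25 s ~= 0.0069 hours.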
|
||||
|
||||
|
||||
def get_silence(sec):
|
||||
return np.zeros(hparams.sample_rate * sec)
|
||||
|
||||
|
||||
def spectrogram(y):
|
||||
D = _stft(_preemphasis(y))
|
||||
S = _amp_to_db(np.abs(D)) - hparams.ref_level_db
|
||||
return _normalize(S)
|
||||
|
||||
|
||||
def inv_spectrogram(spectrogram):
|
||||
S = _db_to_amp(_denormalize(spectrogram) + hparams.ref_level_db) # Convert back to linear
|
||||
return inv_preemphasis(_griffin_lim(S ** hparams.power)) # Reconstruct phase
|
||||
|
||||
|
||||
def inv_spectrogram_tensorflow(spectrogram):
|
||||
S = _db_to_amp_tensorflow(_denormalize_tensorflow(spectrogram) + hparams.ref_level_db)
|
||||
return _griffin_lim_tensorflow(tf.pow(S, hparams.power))
|
||||
|
||||
|
||||
def melspectrogram(y):
|
||||
D = _stft(_preemphasis(y))
|
||||
S = _amp_to_db(_linear_to_mel(np.abs(D)))
|
||||
return _normalize(S)
|
||||
|
||||
|
||||
def inv_melspectrogram(melspectrogram):
|
||||
S = _mel_to_linear(_db_to_amp(_denormalize(melspectrogram))) # Convert back to linear
|
||||
return inv_preemphasis(_griffin_lim(S ** hparams.power)) # Reconstruct phase
|
||||
|
||||
|
||||
# Based on https://github.com/librosa/librosa/issues/434
|
||||
def _griffin_lim(S):
|
||||
angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
|
||||
S_complex = np.abs(S).astype(np.complex)
|
||||
|
||||
y = _istft(S_complex * angles)
|
||||
for i in range(hparams.griffin_lim_iters):
|
||||
angles = np.exp(1j * np.angle(_stft(y)))
|
||||
y = _istft(S_complex * angles)
|
||||
return y
|
||||
|
||||
|
||||
def _griffin_lim_tensorflow(S):
|
||||
with tf.variable_scope('griffinlim'):
|
||||
S = tf.expand_dims(S, 0)
|
||||
S_complex = tf.identity(tf.cast(S, dtype=tf.complex64))
|
||||
y = _istft_tensorflow(S_complex)
|
||||
for i in range(hparams.griffin_lim_iters):
|
||||
est = _stft_tensorflow(y)
|
||||
angles = est / tf.cast(tf.maximum(1e-8, tf.abs(est)), tf.complex64)
|
||||
y = _istft_tensorflow(S_complex * angles)
|
||||
return tf.squeeze(y, 0)
|
||||
|
||||
|
||||
def _stft(y):
|
||||
n_fft, hop_length, win_length = _stft_parameters()
|
||||
return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
|
||||
|
||||
|
||||
def _istft(y):
|
||||
_, hop_length, win_length = _stft_parameters()
|
||||
return librosa.istft(y, hop_length=hop_length, win_length=win_length)
|
||||
|
||||
|
||||
def _stft_tensorflow(signals):
|
||||
n_fft, hop_length, win_length = _stft_parameters()
|
||||
return tf.contrib.signal.stft(signals, win_length, hop_length, n_fft, pad_end=False)
|
||||
|
||||
|
||||
def _istft_tensorflow(stfts):
|
||||
n_fft, hop_length, win_length = _stft_parameters()
|
||||
return tf.contrib.signal.inverse_stft(stfts, win_length, hop_length, n_fft)
|
||||
|
||||
def _stft_parameters():
|
||||
n_fft = (hparams.num_freq - 1) * 2
|
||||
hop_length = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
|
||||
win_length = int(hparams.frame_length_ms / 1000 * hparams.sample_rate)
|
||||
return n_fft, hop_length, win_length
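# Worked example of the parameter arithmetic above (a sketch only -- these
# hparams values are assumptions, not confirmed by this commit):
#   num_freq=1025, frame_shift_ms=12.5, frame_length_ms=50, sample_rate=24000
#   n_fft      = (1025 - 1) * 2            = 2048
#   hop_length = int(12.5 / 1000 * 24000)  = 300
#   win_length = int(50.0 / 1000 * 24000)  = 1200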
|
||||
|
||||
|
||||
# Conversions:
|
||||
|
||||
_mel_basis = None
|
||||
_inv_mel_basis = None
|
||||
|
||||
def _linear_to_mel(spectrogram):
|
||||
global _mel_basis
|
||||
if _mel_basis is None:
|
||||
_mel_basis = _build_mel_basis()
|
||||
return np.dot(_mel_basis, spectrogram)
|
||||
|
||||
def _mel_to_linear(mel_spectrogram):
|
||||
global _inv_mel_basis
|
||||
if _inv_mel_basis is None:
|
||||
_inv_mel_basis = np.linalg.pinv(_build_mel_basis())
|
||||
return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram))
|
||||
|
||||
def _build_mel_basis():
|
||||
n_fft = (hparams.num_freq - 1) * 2
|
||||
return librosa.filters.mel(hparams.sample_rate, n_fft, n_mels=hparams.num_mels)
|
||||
|
||||
def _amp_to_db(x):
|
||||
return 20 * np.log10(np.maximum(1e-5, x))
|
||||
|
||||
def _db_to_amp(x):
|
||||
return np.power(10.0, x * 0.05)
|
||||
|
||||
def _db_to_amp_tensorflow(x):
|
||||
return tf.pow(tf.ones(tf.shape(x)) * 10.0, x * 0.05)
|
||||
|
||||
def _preemphasis(x):
|
||||
return signal.lfilter([1, -hparams.preemphasis], [1], x)
|
||||
|
||||
def inv_preemphasis(x):
|
||||
return signal.lfilter([1], [1, -hparams.preemphasis], x)
|
||||
|
||||
def _normalize(S):
|
||||
return np.clip((S - hparams.min_level_db) / -hparams.min_level_db, 0, 1)
|
||||
|
||||
def _denormalize(S):
|
||||
return (np.clip(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db
|
||||
|
||||
def _denormalize_tensorflow(S):
|
||||
return (tf.clip_by_value(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db

audio/get_duration.py (new file, 71 lines)
@@ -0,0 +1,71 @@
import os
|
||||
import datetime
|
||||
from glob import glob
|
||||
from tqdm import tqdm
|
||||
from tinytag import TinyTag
|
||||
from collections import defaultdict
|
||||
from multiprocessing.dummy import Pool
|
||||
|
||||
from utils import load_json
|
||||
|
||||
def second_to_hour(sec):
|
||||
return str(datetime.timedelta(seconds=int(sec)))
|
||||
|
||||
def get_duration(path):
|
||||
filename = os.path.basename(path)
|
||||
candidates = filename.split('.')[0].split('_')
|
||||
dataset = candidates[0]
|
||||
|
||||
if not os.path.exists(path):
|
||||
print(" [!] {} not found".format(path))
|
||||
return dataset, 0
|
||||
|
||||
if True: # tinytag
|
||||
tag = TinyTag.get(path)
|
||||
duration = tag.duration
|
||||
else: # librosa
|
||||
y, sr = librosa.load(path)
|
||||
duration = librosa.get_duration(y=y, sr=sr)
|
||||
|
||||
return dataset, duration
|
||||
|
||||
def get_durations(paths, print_detail=True):
|
||||
duration_all = 0
|
||||
duration_book = defaultdict(list)
|
||||
|
||||
pool = Pool()
|
||||
iterator = pool.imap_unordered(get_duration, paths)
|
||||
for dataset, duration in tqdm(iterator, total=len(paths)):
|
||||
duration_all += duration
|
||||
duration_book[dataset].append(duration)
|
||||
|
||||
total_count = 0
|
||||
for book, duration in duration_book.items():
|
||||
if book:
|
||||
time = second_to_hour(sum(duration))
|
||||
file_count = len(duration)
|
||||
total_count += file_count
|
||||
|
||||
if print_detail:
|
||||
print(" [*] Duration of {}: {} (file #: {})". \
|
||||
format(book, time, file_count))
|
||||
|
||||
print(" [*] Total Duration : {} (file #: {})". \
|
||||
format(second_to_hour(duration_all), total_count))
|
||||
print()
|
||||
return duration_all
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--audio-pattern', default=None) # datasets/krbook/audio/*.wav
|
||||
parser.add_argument('--data-path', default=None) # datasets/jtbc/alignment.json
|
||||
config, unparsed = parser.parse_known_args()
|
||||
|
||||
if config.audio_pattern is not None:
|
||||
duration = get_durations(glob(config.audio_pattern))  # use the --audio-pattern glob; get_paths_by_pattern / config.data_dir are not defined here
|
||||
elif config.data_path is not None:
|
||||
paths = load_json(config.data_path).keys()
|
||||
duration = get_durations(paths)

audio/google_speech.py (new file, 520 lines)
@@ -0,0 +1,520 @@
import io
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import string
|
||||
import argparse
|
||||
import operator
|
||||
import numpy as np
|
||||
from glob import glob
|
||||
from tqdm import tqdm
|
||||
from nltk import ngrams
|
||||
from difflib import SequenceMatcher
|
||||
from collections import defaultdict
|
||||
|
||||
from google.cloud import speech
|
||||
from google.cloud.speech import enums
|
||||
from google.cloud.speech import types
|
||||
|
||||
from utils import parallel_run
|
||||
from text import text_to_sequence
|
||||
|
||||
####################################################
|
||||
# When one or two audio is missed in the middle
|
||||
####################################################
|
||||
|
||||
def get_continuous_audio_paths(paths, debug=False):
|
||||
audio_ids = get_audio_ids_from_paths(paths)
|
||||
min_id, max_id = min(audio_ids), max(audio_ids)
|
||||
|
||||
if int(max_id) - int(min_id) + 1 != len(audio_ids):
|
||||
base_path = paths[0].replace(min_id, "{:0" + str(len(max_id)) + "d}")
|
||||
new_paths = [
|
||||
base_path.format(audio_id) \
|
||||
for audio_id in range(int(min_id), int(max_id) + 1)]
|
||||
|
||||
if debug: print("Missing audio : {} -> {}".format(paths, new_paths))
|
||||
return new_paths
|
||||
else:
|
||||
return paths
|
||||
|
||||
def get_argmax_key(info, with_value=False):
|
||||
max_key = max(info.keys(), key=(lambda k: info[k]))
|
||||
|
||||
if with_value:
|
||||
return max_key, info[max_key]
|
||||
else:
|
||||
return max_key
|
||||
|
||||
def similarity(text_a, text_b):
|
||||
text_a = "".join(remove_puncuations(text_a.strip()).split())
|
||||
text_b = "".join(remove_puncuations(text_b.strip()).split())
|
||||
|
||||
score = SequenceMatcher(None, text_a, text_b).ratio()
|
||||
#score = 1 / (distance(decompose_ko_text(text_a), decompose_ko_text(text_b)) + 1e-5)
|
||||
#score = SequenceMatcher(None,
|
||||
# decompose_ko_text(text_a), decompose_ko_text(text_b)).ratio()
|
||||
|
||||
if len(text_a) < len(text_b):
|
||||
return -1 + score
|
||||
else:
|
||||
return score
|
||||
|
||||
def get_key_value_sorted(data):
|
||||
keys = list(data.keys())
|
||||
keys.sort()
|
||||
values = [data[key] for key in keys]
|
||||
return keys, values
|
||||
|
||||
def replace_pred_with_book(
|
||||
path, book_path=None, threshold=0.9, max_candidate_num=5,
|
||||
min_post_char_check=2, max_post_char_check=7, max_n=5,
|
||||
max_allow_missing_when_matching=4, debug=False):
|
||||
|
||||
#######################################
|
||||
# find text book from pred
|
||||
#######################################
|
||||
|
||||
if book_path is None:
|
||||
book_path = path.replace("speech", "text").replace("json", "txt")
|
||||
|
||||
data = json.loads(open(path).read())
|
||||
|
||||
keys, preds = get_key_value_sorted(data)
|
||||
|
||||
book_words = [word for word in open(book_path).read().split() if word != "=="]
|
||||
book_texts = [text.replace('\n', '') for text in open(book_path).readlines()]
|
||||
|
||||
loc = 0
|
||||
prev_key = None
|
||||
force_stop = False
|
||||
prev_end_loc = -1
|
||||
prev_sentence_ended = True
|
||||
|
||||
prev_empty_skip = False
|
||||
prev_not_found_skip = False
|
||||
|
||||
black_lists = ["160.{:04d}".format(audio_id) for audio_id in range(20, 36)]
|
||||
|
||||
new_preds = {}
|
||||
for key, pred in zip(keys, preds):
|
||||
if debug: print(key, pred)
|
||||
|
||||
if pred == "" or key in black_lists:
|
||||
prev_empty_skip = True
|
||||
continue
|
||||
|
||||
width, counter = 1, 0
|
||||
sim_dict, loc_dict = {}, {}
|
||||
|
||||
while True:
|
||||
words = book_words[loc:loc + width]
|
||||
|
||||
if len(words) == 0:
|
||||
print("Force stop. Left {}, Del {} {}". \
|
||||
format(len(preds) - len(new_preds), new_preds[prev_key], prev_key))
|
||||
new_preds.pop(prev_key, None)
|
||||
force_stop = True
|
||||
break
|
||||
|
||||
candidate_candidates = {}
|
||||
|
||||
for _pred in list(set([pred, koreanize_numbers(pred)])):
|
||||
max_skip = 0 if has_number(_pred[0]) or \
|
||||
_pred[0] in """"'“”’‘’""" else len(words)
|
||||
|
||||
end_sims = []
|
||||
for idx in range(min(max_skip, 10)):
|
||||
text = " ".join(words[idx:])
|
||||
|
||||
################################################
|
||||
# Score of trailing sentence is also important
|
||||
################################################
|
||||
|
||||
for jdx in range(min_post_char_check,
|
||||
max_post_char_check):
|
||||
sim = similarity(
|
||||
"".join(_pred.split())[-jdx:],
|
||||
"".join(text.split())[-jdx:])
|
||||
end_sims.append(sim)
|
||||
|
||||
candidate_candidates[text] = similarity(_pred, text)
|
||||
|
||||
candidate, sim = get_argmax_key(
|
||||
candidate_candidates, with_value=True)
|
||||
|
||||
if sim > threshold or max(end_sims + [-1]) > threshold - 0.2 or \
|
||||
len(sim_dict) > 0:
|
||||
sim_dict[candidate] = sim
|
||||
loc_dict[candidate] = loc + width
|
||||
|
||||
if len(sim_dict) > 0:
|
||||
counter += 1
|
||||
|
||||
if counter > max_candidate_num:
|
||||
break
|
||||
|
||||
width += 1
|
||||
|
||||
if width - len(_pred.split()) > 5:
|
||||
break
|
||||
|
||||
if force_stop:
|
||||
break
|
||||
|
||||
if len(sim_dict) != 0:
|
||||
#############################################################
|
||||
# Check missing words between prev pred and current pred
|
||||
#############################################################
|
||||
|
||||
if prev_key is not None:
|
||||
cur_idx = int(key.rsplit('.', 2)[-2])
|
||||
prev_idx = int(prev_key.rsplit('.', 2)[-2])
|
||||
|
||||
if cur_idx - prev_idx > 10:
|
||||
force_stop = True
|
||||
break
|
||||
|
||||
# words aligned based on the prediction, but the match may contain missing words
|
||||
# because google speech recognition sometimes skip one or two word
|
||||
# ex. ('오누이는 서로 자기가 할 일을 정했다.', '서로 자기가 할 일을 정했다.')
|
||||
original_candidate = new_candidate = get_argmax_key(sim_dict)
|
||||
|
||||
word_to_find = original_candidate.split()[0]
|
||||
|
||||
if not prev_empty_skip:
|
||||
search_idx = book_words[prev_end_loc:].index(word_to_find) \
|
||||
if word_to_find in book_words[prev_end_loc:] else -1
|
||||
|
||||
if 0 < search_idx < 4 and not prev_sentence_ended:
|
||||
words_to_check = book_words[prev_end_loc:prev_end_loc + search_idx]
|
||||
|
||||
if ends_with_punctuation(words_to_check[0]) == True:
|
||||
tmp = " ".join([new_preds[prev_key]] + words_to_check[:1])
|
||||
if debug: print(prev_key, tmp, new_preds[prev_key])
|
||||
new_preds[prev_key] = tmp
|
||||
|
||||
prev_end_loc += 1
|
||||
prev_sentence_ended = True
|
||||
|
||||
search_idx = book_words[prev_end_loc:].index(word_to_find) \
|
||||
if word_to_find in book_words[prev_end_loc:] else -1
|
||||
|
||||
if 0 < search_idx < 4 and prev_sentence_ended:
|
||||
words_to_check = book_words[prev_end_loc:prev_end_loc + search_idx]
|
||||
|
||||
if not any(ends_with_punctuation(word) for word in words_to_check):
|
||||
new_candidate = " ".join(words_to_check + [original_candidate])
|
||||
if debug: print(key, new_candidate, original_candidate)
|
||||
|
||||
new_preds[key] = new_candidate
|
||||
prev_sentence_ended = ends_with_punctuation(new_candidate)
|
||||
|
||||
loc = loc_dict[original_candidate]
|
||||
prev_key = key
|
||||
prev_not_found_skip = False
|
||||
else:
|
||||
loc += len(_pred.split()) - 1
|
||||
prev_sentence_ended = True
|
||||
prev_not_found_skip = True
|
||||
|
||||
prev_end_loc = loc
|
||||
prev_empty_skip = False
|
||||
|
||||
if debug:
|
||||
print("=", pred)
|
||||
print("=", new_preds[key], loc)
|
||||
|
||||
if force_stop:
|
||||
print(" [!] Force stop: {}".format(path))
|
||||
|
||||
align_diff = loc - len(book_words)
|
||||
|
||||
if abs(align_diff) > 10:
|
||||
print(" => Align result of {}: {} - {} = {}".format(path, loc, len(book_words), align_diff))
|
||||
|
||||
#######################################
|
||||
# find exact match of n-gram of pred
|
||||
#######################################
|
||||
|
||||
finished_ids = []
|
||||
|
||||
keys, preds = get_key_value_sorted(new_preds)
|
||||
|
||||
if abs(align_diff) > 10:
|
||||
keys, preds = keys[:-30], preds[:-30]
|
||||
|
||||
unfinished_ids = range(len(keys))
|
||||
text_matches = []
|
||||
|
||||
for n in range(max_n, 1, -1):
|
||||
ngram_preds = ngrams(preds, n)
|
||||
|
||||
for n_allow_missing in range(0, max_allow_missing_when_matching + 1):
|
||||
unfinished_ids = list(set(unfinished_ids) - set(finished_ids))
|
||||
|
||||
existing_ngram_preds = []
|
||||
|
||||
for ngram in ngram_preds:
|
||||
for text in book_texts:
|
||||
candidates = [
|
||||
" ".join(text.split()[:-n_allow_missing]),
|
||||
" ".join(text.split()[n_allow_missing:]),
|
||||
]
|
||||
for tmp_text in candidates:
|
||||
if " ".join(ngram) == tmp_text:
|
||||
existing_ngram_preds.append(ngram)
|
||||
break
|
||||
|
||||
tmp_keys = []
|
||||
cur_ngram = []
|
||||
|
||||
ngram_idx = 0
|
||||
ngram_found = False
|
||||
|
||||
for id_idx in unfinished_ids:
|
||||
key, pred = keys[id_idx], preds[id_idx]
|
||||
|
||||
if ngram_idx >= len(existing_ngram_preds):
|
||||
break
|
||||
|
||||
cur_ngram = existing_ngram_preds[ngram_idx]
|
||||
|
||||
if pred in cur_ngram:
|
||||
ngram_found = True
|
||||
|
||||
tmp_keys.append(key)
|
||||
finished_ids.append(id_idx)
|
||||
|
||||
if len(tmp_keys) == len(cur_ngram):
|
||||
if debug: print(n_allow_missing, tmp_keys, cur_ngram)
|
||||
|
||||
tmp_keys = get_continuous_audio_paths(tmp_keys, debug)
|
||||
text_matches.append(
|
||||
[[" ".join(cur_ngram)], tmp_keys]
|
||||
)
|
||||
|
||||
ngram_idx += 1
|
||||
tmp_keys = []
|
||||
cur_ngram = []
|
||||
else:
|
||||
if pred == cur_ngram[-1]:
|
||||
ngram_idx += 1
|
||||
tmp_keys = []
|
||||
cur_ngram = []
|
||||
else:
|
||||
if len(tmp_keys) > 0:
|
||||
ngram_found = False
|
||||
|
||||
tmp_keys = []
|
||||
cur_ngram = []
|
||||
|
||||
for id_idx in range(len(keys)):
|
||||
if id_idx not in finished_ids:
|
||||
key, pred = keys[id_idx], preds[id_idx]
|
||||
|
||||
text_matches.append(
|
||||
[[pred], [key]]
|
||||
)
|
||||
|
||||
##############################################################
|
||||
# ngram again for just in case after adding missing words
|
||||
##############################################################
|
||||
|
||||
max_keys = [max(get_audio_ids_from_paths(item[1], as_int=True)) for item in text_matches]
|
||||
sorted_text_matches = \
|
||||
[item for _, item in sorted(zip(max_keys, text_matches))]
|
||||
|
||||
preds = [item[0][0] for item in sorted_text_matches]
|
||||
keys = [item[1] for item in sorted_text_matches]
|
||||
|
||||
def book_sentence_idx_search(query, book_texts):
|
||||
for idx, text in enumerate(book_texts):
|
||||
if query in text:
|
||||
return idx, text
|
||||
return False, False
|
||||
|
||||
text_matches = []
|
||||
idx, book_cursor_idx = 0, 0
|
||||
|
||||
if len(preds) == 0:
|
||||
return []
|
||||
|
||||
while True:
|
||||
tmp_texts = book_texts[book_cursor_idx:]
|
||||
|
||||
jdx = 0
|
||||
tmp_pred = preds[idx]
|
||||
idxes_to_merge = [idx]
|
||||
|
||||
prev_sent_idx, prev_sent = book_sentence_idx_search(tmp_pred, tmp_texts)
|
||||
while idx + jdx + 1 < len(preds):
|
||||
jdx += 1
|
||||
|
||||
tmp_pred = preds[idx + jdx]
|
||||
sent_idx, sent = book_sentence_idx_search(tmp_pred, tmp_texts)
|
||||
|
||||
if not sent_idx:
|
||||
if debug: print(" [!] NOT FOUND: {}".format(tmp_pred))
|
||||
break
|
||||
|
||||
if prev_sent_idx == sent_idx:
|
||||
idxes_to_merge.append(idx + jdx)
|
||||
else:
|
||||
break
|
||||
|
||||
new_keys = get_continuous_audio_paths(
|
||||
sum([keys[jdx] for jdx in idxes_to_merge], []))
|
||||
text_matches.append([ [tmp_texts[prev_sent_idx]], new_keys ])
|
||||
|
||||
if len(new_keys) > 1:
|
||||
book_cursor_idx += 1
|
||||
|
||||
book_cursor_idx = max(book_cursor_idx, sent_idx)
|
||||
|
||||
if idx == len(preds) - 1:
|
||||
break
|
||||
idx = idx + jdx
|
||||
|
||||
# Counter([len(i) for i in text_matches.values()])
|
||||
return text_matches
|
||||
|
||||
def get_text_from_audio_batch(paths, multi_process=False):
|
||||
results = {}
|
||||
items = parallel_run(get_text_from_audio, paths,
|
||||
desc="get_text_from_audio_batch")
|
||||
for item in items:
|
||||
results.update(item)
|
||||
return results
|
||||
|
||||
def get_text_from_audio(path):
|
||||
error_count = 0
|
||||
|
||||
txt_path = path.replace('flac', 'txt')
|
||||
|
||||
if os.path.exists(txt_path):
|
||||
with open(txt_path) as f:
|
||||
out = json.loads(open(txt_path).read())
|
||||
return out
|
||||
|
||||
out = {}
|
||||
while True:
|
||||
try:
|
||||
client = speech.SpeechClient()
|
||||
|
||||
with io.open(path, 'rb') as audio_file:
|
||||
content = audio_file.read()
|
||||
audio = types.RecognitionAudio(content=content)
|
||||
|
||||
config = types.RecognitionConfig(
|
||||
encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
|
||||
sample_rate_hertz=16000,
|
||||
language_code='ko-KR')
|
||||
|
||||
response = client.recognize(config, audio)
|
||||
if len(response.results) > 0:
|
||||
alternatives = response.results[0].alternatives
|
||||
|
||||
results = [alternative.transcript for alternative in alternatives]
|
||||
assert len(results) == 1, "More than 1 results: {}".format(results)
|
||||
|
||||
out = { path: "" if len(results) == 0 else results[0] }
|
||||
print(results[0])
|
||||
break
|
||||
break
|
||||
except:
|
||||
error_count += 1
|
||||
print("Skip warning for {} for {} times". \
|
||||
format(path, error_count))
|
||||
|
||||
if error_count > 5:
|
||||
break
|
||||
else:
|
||||
continue
|
||||
|
||||
with open(txt_path, 'w') as f:
|
||||
json.dump(out, f, indent=2, ensure_ascii=False)
|
||||
|
||||
return out
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--asset-dir', type=str, default='assets')
|
||||
parser.add_argument('--data-dir', type=str, default='audio')
|
||||
parser.add_argument('--pattern', type=str, default="audio/*.flac")
|
||||
parser.add_argument('--metadata', type=str, default="metadata.json")
|
||||
config, unparsed = parser.parse_known_args()
|
||||
|
||||
paths = glob(config.pattern)
|
||||
paths.sort()
|
||||
paths = paths
|
||||
|
||||
book_ids = list(set([
|
||||
os.path.basename(path).split('.', 1)[0] for path in paths]))
|
||||
book_ids.sort()
|
||||
|
||||
def get_finished_ids():
|
||||
finished_paths = glob(os.path.join(
|
||||
config.asset_dir, "speech-*.json"))
|
||||
finished_ids = list(set([
|
||||
os.path.basename(path).split('.', 1)[0].replace("speech-", "") for path in finished_paths]))
|
||||
finished_ids.sort()
|
||||
return finished_ids
|
||||
|
||||
finished_ids = get_finished_ids()
|
||||
|
||||
print("# Finished : {}/{}".format(len(finished_ids), len(book_ids)))
|
||||
|
||||
book_ids_to_parse = list(set(book_ids) - set(finished_ids))
|
||||
book_ids_to_parse.sort()
|
||||
|
||||
assert os.path.exists(config.asset_dir), "asset_dir not found"
|
||||
|
||||
pbar = tqdm(book_ids_to_parse, "[1] google_speech",
|
||||
initial=len(finished_ids), total=len(book_ids))
|
||||
|
||||
for book_id in pbar:
|
||||
current_paths = glob(config.pattern.replace("*", "{}.*".format(book_id)))
|
||||
pbar.set_description("[1] google_speech : {}".format(book_id))
|
||||
|
||||
results = get_text_from_audio_batch(current_paths)
|
||||
|
||||
filename = "speech-{}.json".format(book_id)
|
||||
path = os.path.join(config.asset_dir, filename)
|
||||
|
||||
with open(path, "w") as f:
|
||||
json.dump(results, f, indent=2, ensure_ascii=False)
|
||||
|
||||
finished_ids = get_finished_ids()
|
||||
|
||||
for book_id in tqdm(finished_ids, "[2] text_match"):
|
||||
filename = "speech-{}.json".format(book_id)
|
||||
path = os.path.join(config.asset_dir, filename)
|
||||
clean_path = path.replace("speech", "clean-speech")
|
||||
|
||||
if os.path.exists(clean_path):
|
||||
print(" [*] Skip {}".format(clean_path))
|
||||
else:
|
||||
results = replace_pred_with_book(path)
|
||||
with open(clean_path, "w") as f:
|
||||
json.dump(results, f, indent=2, ensure_ascii=False)
|
||||
|
||||
# Dummy
|
||||
|
||||
if False:
|
||||
match_paths = get_paths_by_pattern(
|
||||
config.asset_dir, 'clean-speech-*.json')
|
||||
|
||||
metadata_path = os.path.join(config.data_dir, config.metadata)
|
||||
|
||||
print(" [3] Merge clean-speech-*.json into {}".format(metadata_path))
|
||||
|
||||
merged_data = []
|
||||
for path in match_paths:
|
||||
with open(path) as f:
|
||||
merged_data.extend(json.loads(f.read()))
|
||||
|
||||
import ipdb; ipdb.set_trace()
|
||||
|
||||
with open(metadata_path, 'w') as f:
|
||||
json.dump(merged_data, f, indent=2, ensure_ascii=False)

audio/silence.py (new file, 143 lines)
@@ -0,0 +1,143 @@
import os
|
||||
import re
|
||||
import sys
|
||||
import json
|
||||
import librosa
|
||||
import argparse
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
from glob import glob
|
||||
from pydub import silence
|
||||
from pydub import AudioSegment
|
||||
from functools import partial
|
||||
|
||||
from hparams import hparams
|
||||
from utils import parallel_run, add_postfix
|
||||
from audio import load_audio, save_audio, get_duration, get_silence
|
||||
|
||||
def abs_mean(x):
|
||||
return abs(x).mean()
|
||||
|
||||
def remove_breath(audio):
|
||||
edges = librosa.effects.split(
|
||||
audio, top_db=40, frame_length=128, hop_length=32)
|
||||
|
||||
for idx in range(len(edges)):
|
||||
start_idx, end_idx = edges[idx][0], edges[idx][1]
|
||||
if start_idx < len(audio):
|
||||
if abs_mean(audio[start_idx:end_idx]) < abs_mean(audio) - 0.05:
|
||||
audio[start_idx:end_idx] = 0
|
||||
|
||||
return audio
|
||||
|
||||
def split_on_silence_with_librosa(
|
||||
audio_path, top_db=40, frame_length=1024, hop_length=256,
|
||||
skip_idx=0, out_ext="wav",
|
||||
min_segment_length=3, max_segment_length=8,
|
||||
pre_silence_length=0, post_silence_length=0):
|
||||
|
||||
filename = os.path.basename(audio_path).split('.', 1)[0]
|
||||
in_ext = audio_path.rsplit(".")[1]
|
||||
|
||||
audio = load_audio(audio_path)
|
||||
|
||||
edges = librosa.effects.split(audio,
|
||||
top_db=top_db, frame_length=frame_length, hop_length=hop_length)
|
||||
|
||||
new_audio = np.zeros_like(audio)
|
||||
for idx, (start, end) in enumerate(edges[skip_idx:]):
|
||||
new_audio[start:end] = remove_breath(audio[start:end])
|
||||
|
||||
save_audio(new_audio, add_postfix(audio_path, "no_breath"))
|
||||
audio = new_audio
|
||||
edges = librosa.effects.split(audio,
|
||||
top_db=top_db, frame_length=frame_length, hop_length=hop_length)
|
||||
|
||||
audio_paths = []
|
||||
for idx, (start, end) in enumerate(edges[skip_idx:]):
|
||||
segment = audio[start:end]
|
||||
duration = get_duration(segment)
|
||||
|
||||
if duration <= min_segment_length or duration >= max_segment_length:
|
||||
continue
|
||||
|
||||
output_path = "{}/{}.{:04d}.{}".format(
|
||||
os.path.dirname(audio_path), filename, idx, out_ext)
|
||||
|
||||
padded_segment = np.concatenate([
|
||||
get_silence(pre_silence_length),
|
||||
segment,
|
||||
get_silence(post_silence_length),
|
||||
])
|
||||
|
||||
save_audio(padded_segment, output_path)
|
||||
audio_paths.append(output_path)
|
||||
|
||||
return audio_paths
|
||||
|
||||
def read_audio(audio_path):
|
||||
return AudioSegment.from_file(audio_path)
|
||||
|
||||
def split_on_silence_with_pydub(
|
||||
audio_path, skip_idx=0, out_ext="wav",
|
||||
silence_thresh=-40, min_silence_len=400,
|
||||
silence_chunk_len=100, keep_silence=100):
|
||||
|
||||
filename = os.path.basename(audio_path).split('.', 1)[0]
|
||||
in_ext = audio_path.rsplit(".")[1]
|
||||
|
||||
audio = read_audio(audio_path)
|
||||
not_silence_ranges = silence.detect_nonsilent(
|
||||
audio, min_silence_len=silence_chunk_len,
|
||||
silence_thresh=silence_thresh)
|
||||
|
||||
edges = [not_silence_ranges[0]]
|
||||
|
||||
for idx in range(1, len(not_silence_ranges)-1):
|
||||
cur_start = not_silence_ranges[idx][0]
|
||||
prev_end = edges[-1][1]
|
||||
|
||||
if cur_start - prev_end < min_silence_len:
|
||||
edges[-1][1] = not_silence_ranges[idx][1]
|
||||
else:
|
||||
edges.append(not_silence_ranges[idx])
|
||||
|
||||
audio_paths = []
|
||||
for idx, (start_idx, end_idx) in enumerate(edges[skip_idx:]):
|
||||
start_idx = max(0, start_idx - keep_silence)
|
||||
end_idx += keep_silence
|
||||
|
||||
target_audio_path = "{}/{}.{:04d}.{}".format(
|
||||
os.path.dirname(audio_path), filename, idx, out_ext)
|
||||
|
||||
audio[start_idx:end_idx].export(target_audio_path, out_ext)
|
||||
|
||||
audio_paths.append(target_audio_path)
|
||||
|
||||
|