multi-speaker-tacotron-tens.../datasets/yuinna/download.py

import os
import sys
import json
import argparse
from functools import partial

import requests
from bs4 import BeautifulSoup

from utils import download_with_url, makedirs, parallel_run

base_path = os.path.dirname(os.path.realpath(__file__))

# RSS feed of the KBS podcast channel used as the audio source for this dataset
RSS_URL = "http://enabler.kbs.co.kr/api/podcast_channel/feed.xml?channel_id=R2010-0440"

def itunes_download(item):
    """Download a single episode to audio/<date>.mp4, skipping files that already exist."""
    audio_dir = os.path.join(base_path, "audio")

    date, url = item
    path = os.path.join(audio_dir, "{}.mp4".format(date))

    if not os.path.exists(path):
        download_with_url(url, path)

def download_all(config):
    audio_dir = os.path.join(base_path, "audio")
    makedirs(audio_dir)

    # Parse the podcast RSS feed and collect every <item> entry
    soup = BeautifulSoup(requests.get(RSS_URL).text, "html5lib")
    items = soup.find_all('item')

    # Episode title (fixed prefix/suffix stripped) and the <guid> audio URL
    titles = [item.find('title').text[9:-3] for item in items]
    guids = [item.find('guid').text for item in items]

    # Keep only episodes whose title contains one of these segment names
    # and whose title does not contain '-'
    accept_list = ['친절한 인나씨', '반납예정일', '귀욤열매 드세요']

    new_guids = [guid for title, guid in zip(titles, guids)
                 if any(accept in title for accept in accept_list) and '-' not in title]
    new_titles = [title for title, _ in zip(titles, guids)
                  if any(accept in title for accept in accept_list) and '-' not in title]

    for idx, title in enumerate(new_titles):
        print(" [{:3d}] {}, {}".format(
            idx + 1, title, os.path.basename(new_guids[idx]).split('_')[2]))

        # Mark where the download cut-off (--max_num) falls in the listing
        if idx == config.max_num:
            print("=" * 30)

    # Map broadcast date (taken from the guid filename) to its download URL
    urls = {
        os.path.basename(guid).split('_')[2]: guid
        for guid in new_guids[:config.max_num]
    }

    parallel_run(itunes_download, urls.items(),
                 desc=" [*] Itunes download", parallel=True)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--max_num', default=100, type=int)

    config, unparsed = parser.parse_known_args()
    download_all(config)
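
# Example invocation (illustrative; assumes the sibling utils.py from this
# repository is importable and that the KBS feed above is still reachable):
#
#   python download.py --max_num 20
#
# This lists the matching episodes and downloads up to 20 of them, in parallel,
# into an audio/ directory next to this script as <date>.mp4 files.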