# multi-speaker-tacotron-tens.../datasets/son/download.py
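"""Download JTBC news videos and article transcripts, then extract audio.

Builds the raw assets for the `son` dataset: for each news ID the script
saves the article text (original and a cleaned one-sentence-per-line
version) under assets/, the HLS video stream under video/, and a WAV
track under audio/.
"""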
import re
import os
import sys
import m3u8
import json
import requests
import subprocess
from functools import partial
from bs4 import BeautifulSoup

from utils import get_encoder_name, parallel_run, makedirs

API_URL = 'http://api.jtbc.joins.com/ad/pre/NV10173083'
BASE_URL = 'http://nsvc.jtbc.joins.com/API/News/Newapp/Default.aspx'


def soupify(text):
    return BeautifulSoup(text, "html.parser")
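

# One "page" of the JTBC news-app listing API; the response is XML whose
# <news_id> elements carry the IDs consumed by download_news_video_and_content.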
def get_news_ids(page_id):
    params = {
        'NJC': 'NJC300',
        'CAID': 'NC10011174',
        'PGI': page_id,
    }
    response = requests.request(
        method='GET', url=BASE_URL, params=params,
    )
    soup = soupify(response.text)
    return [item.text for item in soup.find_all('news_id')]
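

# For a single news ID: fetch the article XML, write the raw and cleaned
# transcripts under assets/, download the HLS video stream to video/<id>.ts,
# and extract a WAV track to audio/<id>.wav.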
def download_news_video_and_content(
        news_id, base_dir, chunk_size=32*1024,
        video_dir="video", asset_dir="assets", audio_dir="audio"):

    video_dir = os.path.join(base_dir, video_dir)
    asset_dir = os.path.join(base_dir, asset_dir)
    audio_dir = os.path.join(base_dir, audio_dir)

    makedirs(video_dir)
    makedirs(asset_dir)
    makedirs(audio_dir)

    text_path = os.path.join(asset_dir, "{}.txt".format(news_id))
    original_text_path = os.path.join(asset_dir, "original-{}.txt".format(news_id))

    video_path = os.path.join(video_dir, "{}.ts".format(news_id))
    audio_path = os.path.join(audio_dir, "{}.wav".format(news_id))
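
    # Fetch the article metadata for this ID; the XML response carries both
    # the transcript (<article_contents>) and the video URL (<vod_path>).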
    params = {
        'NJC': 'NJC400',
        'NID': news_id,  # e.g. NB11515152
        'CD': 'A0100',
    }
    response = requests.request(
        method='GET', url=BASE_URL, params=params,
    )
    soup = soupify(response.text)

    article_contents = soup.find_all('article_contents')
    assert len(article_contents) == 1, \
        "# of <article_contents> of {} should be 1: {}".format(news_id, response.text)

    text = soupify(article_contents[0].text).get_text()  # strip the inner <div> markup

    with open(original_text_path, "w", encoding="utf-8") as f:
        f.write(text)
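
    # Normalize the transcript into one sentence per line: drop quoted
    # speech attributions, promo links, and URLs, then sentence-tokenize.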
    with open(text_path, "w", encoding="utf-8") as f:
        from nltk import sent_tokenize  # imported lazily; requires nltk's 'punkt' data

        text = re.sub(r'\[.{0,80} :\s.+]', '', text)  # remove bracketed "[speaker : quote]" segments
        text = re.sub(r'☞.+http.+\)', '', text)  # remove "☞ ..." promo links
        text = re.sub(r'\(https?:\/\/.*[\r\n]*\)', '', text)  # remove url

        sentences = sent_tokenize(text)
        sentences = [sent for sentence in sentences
                     for sent in sentence.split('\n') if sent]

        new_texts = []
        for sent in sentences:
            sent = sent.strip()
            sent = re.sub(r'\([^)]*\)', '', sent)  # drop parenthesized asides
            #sent = re.sub(r'\<.{0,80}\>', '', sent)
            sent = sent.replace('…', '.')  # normalize ellipses to periods
            new_texts.append(sent)

        f.write("\n".join([sent for sent in new_texts if sent]))
    vod_paths = soup.find_all('vod_path')
    assert len(vod_paths) == 1, \
        "# of <vod_path> of {} should be 1: {}".format(news_id, response.text)

    if not os.path.exists(video_path):
        redirect_url = vod_paths[0].text

        list_url = m3u8.load(redirect_url).playlists[0].absolute_uri
        video_urls = [segment.absolute_uri for segment in m3u8.load(list_url).segments]

        with open(video_path, "wb") as f:
            for url in video_urls:
                response = requests.get(url, stream=True)
                for chunk in response.iter_content(chunk_size):
                    if chunk:  # filter out keep-alive chunks
                        f.write(chunk)
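
    # Re-encode the .ts container to 44.1 kHz stereo WAV. get_encoder_name()
    # (from utils) is assumed to return the ffmpeg/avconv binary to invoke.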
    if not os.path.exists(audio_path):
        encoder = get_encoder_name()
        command = [encoder, "-y", "-loglevel", "panic",
                   "-i", video_path, "-ab", "160k", "-ac", "2",
                   "-ar", "44100", "-vn", audio_path]
        subprocess.call(command)

    return True
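

# Entry point: crawl the listing pages to collect all news IDs (cached in
# news_ids.json so reruns skip the crawl), then download video, text, and
# audio for every ID in parallel via parallel_run() from utils.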
if __name__ == '__main__':
    news_ids = []
    page_idx = 1

    base_dir = os.path.dirname(os.path.realpath(__file__))
    news_id_path = os.path.join(base_dir, "news_ids.json")

    if not os.path.exists(news_id_path):
        # Crawl listing pages until an empty page signals the end.
        while True:
            tmp_ids = get_news_ids(page_idx)
            if len(tmp_ids) == 0:
                break

            news_ids.extend(tmp_ids)
            print(" [*] Download page {}: {}/{}".format(
                page_idx, len(tmp_ids), len(news_ids)))
            page_idx += 1

        with open(news_id_path, "w") as f:
            json.dump(news_ids, f, indent=2, ensure_ascii=False)
    else:
        with open(news_id_path) as f:
            news_ids = json.load(f)

    # Skip known-bad IDs, then download everything in parallel.
    exceptions = ["NB10830162"]
    news_ids = list(set(news_ids) - set(exceptions))

    fn = partial(download_news_video_and_content, base_dir=base_dir)
    results = parallel_run(
        fn, news_ids, desc="Download news video+text", parallel=True)