"""Download the audio of listed videos and cut out one clip per metadata row.

Run as a script: reads ``metadata.csv`` next to this file and writes the
clips into an ``audio`` directory next to this file.
"""
import os

import youtube_dl
from pydub import AudioSegment

from utils import makedirs, remove_file

# Directory containing this script; inputs and outputs are resolved against it.
base_dir = os.path.dirname(os.path.realpath(__file__))


def get_mili_sec(text):
    """Convert a ``MM:SS`` or ``H:MM:SS`` timestamp into milliseconds.

    Surrounding whitespace (including a trailing newline) is ignored.

    Args:
        text: Timestamp string such as ``"1:30"`` or ``"1:02:03"``.

    Returns:
        The timestamp as an integer number of milliseconds.

    Raises:
        ValueError: If the timestamp does not have two or three
            colon-separated fields, or a field is not an integer.
    """
    fields = text.strip().split(':')
    if len(fields) not in (2, 3):
        raise ValueError(
            "expected MM:SS or H:MM:SS timestamp, got {!r}".format(text))

    # Fold base-60 fields left to right: ((h * 60) + m) * 60 + s.
    total_seconds = 0
    for field in fields:
        total_seconds = total_seconds * 60 + int(field)
    return total_seconds * 1000


class Data(object):
    """One metadata row describing a clip to cut from a video.

    Attributes:
        text_path: Path of the transcript file; its base name is used to
            name the audio files.
        video_url: URL handed to youtube_dl for downloading.
        title: Title field of the row (stored as-is).
        start: Clip start offset in milliseconds.
        end: Clip end offset in milliseconds.
    """

    def __init__(self, text_path, video_url, title, start_time, end_time):
        self.text_path = text_path
        self.video_url = video_url
        self.title = title

        self.start = get_mili_sec(start_time)
        self.end = get_mili_sec(end_time)


def read_csv(path):
    """Parse a ``|``-separated metadata file into a list of ``Data`` rows.

    Each non-blank line must contain exactly five fields:
    ``text_path|video_url|title|start_time|end_time``.

    Args:
        path: Path of the metadata file.

    Returns:
        A list of ``Data`` objects in file order.

    Raises:
        ValueError: If a non-blank line does not have exactly five fields,
            or a timestamp field cannot be parsed.
    """
    data = []
    with open(path) as f:
        for line_no, line in enumerate(f, 1):
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue

            fields = line.rstrip('\r\n').split('|')
            if len(fields) != 5:
                raise ValueError(
                    "{}:{}: expected 5 '|'-separated fields, got {}".format(
                        path, line_no, len(fields)))

            data.append(Data(*fields))
    return data


def download_audio_with_urls(data, out_ext="wav"):
    """Download each row's audio and export the ``[start, end)`` clip.

    For every row the source audio is downloaded to a temporary
    ``<name>.original.mp3`` file inside ``<base_dir>/audio``, the requested
    time range is exported as ``<name>.<out_ext>``, and the temporary file is
    removed. ``<name>`` is the base name of ``text_path`` without its
    extension.

    Args:
        data: Iterable of ``Data`` rows.
        out_ext: Export format, also used as the output file extension.
    """
    audio_dir = os.path.join(base_dir, 'audio')

    for d in data:
        # Strip only the final extension of the file name itself, so a
        # '.txt' elsewhere in the path is never touched.
        stem = os.path.splitext(os.path.basename(d.text_path))[0]
        original_path = os.path.join(audio_dir, stem + '.original.mp3')
        out_path = os.path.join(audio_dir, '{}.{}'.format(stem, out_ext))

        options = {
            'format': 'bestaudio/best',
            'outtmpl': original_path,
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '320',
            }],
        }
        with youtube_dl.YoutubeDL(options) as ydl:
            ydl.download([d.video_url])

        try:
            audio = AudioSegment.from_file(original_path)
            # AudioSegment slices are indexed in milliseconds.
            audio[d.start:d.end].export(out_path, format=out_ext)
        finally:
            # Do not leave the full-length download behind if export fails.
            remove_file(original_path)


if __name__ == '__main__':
    makedirs(os.path.join(base_dir, "audio"))

    data = read_csv(os.path.join(base_dir, "metadata.csv"))
    download_audio_with_urls(data)