Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Download video metadata at subtitle download time #127

Merged
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Download video metadata at subtitle download time
Fixes #123
  • Loading branch information
danlamanna committed Jan 20, 2024
commit 896f8fdae4ea21542d8dd9565c03e55fd3546c8d
25 changes: 12 additions & 13 deletions yt_fts/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,8 @@ def download_vtts(number_of_jobs, list_of_videos_urls, language ,tmp_dir):
def get_vtt(tmp_dir, video_url, language):
subprocess.run([
"yt-dlp",
"-o", f"{tmp_dir}/%(id)s.%(ext)s",
"-o", f"{tmp_dir}/%(id)s",
"--write-info-json",
"--write-auto-sub",
"--convert-subs", "vtt",
"--skip-download",
Expand All @@ -153,6 +154,10 @@ def vtt_to_db(channel_id, dir_path, s):
file_paths = []

for item in items:
# ignore anything that is not a .vtt subtitle file, e.g. the .info.json files written by --write-info-json
if not item.endswith('.vtt'):
continue

item_path = os.path.join(dir_path, item)
if os.path.isfile(item_path):
file_paths.append(item_path)
Expand All @@ -164,9 +169,9 @@ def vtt_to_db(channel_id, dir_path, s):

for vtt in track(file_paths, description="Adding subtitles to database..."):
base_name = os.path.basename(vtt)
vid_id = re.match(r'^([^.]*)', base_name).group(1)
vid_id = base_name.split('.')[0]
vid_url = f"https://youtu.be/{vid_id}"
vid_title = get_vid_title(vid_url, s)
vid_title = get_vid_title(os.path.join(os.path.dirname(vtt), f'{vid_id}.info.json'))
add_video(channel_id, vid_id, vid_title, vid_url)

vtt_json = parse_vtt(vtt)
Expand All @@ -183,18 +188,12 @@ def vtt_to_db(channel_id, dir_path, s):
con.close()


def get_vid_title(vid_url, s):
def get_vid_title(info_json_path):
"""
Scrapes video title from the video page
Retrieves video title from the info json file.
"""
res = s.get(vid_url)
if res.status_code == 200:
html = res.text
soup = BeautifulSoup(html, 'html.parser')
title = soup.title.string
return title
else:
return None
with open(info_json_path) as f:
return json.load(f)['title']


def validate_channel_url(channel_url):
Expand Down