Skip to content

Commit dd98afd

Browse files
committed
Fix YouTube video extractor script with complete rewrite
- Replace broken meta tag extraction with modern ytInitialData approach
- Fix channel URL construction to use proper /channel/ path prefix
- Add robust error handling for YouTube's changing structure
- Extract title, views, date, channel info, and description successfully
- Maintain backward compatibility with command-line interface

The original script was completely broken due to YouTube's HTML structure changes. This rewrite successfully extracts core video information using the modern approach.
1 parent bf1862e commit dd98afd

File tree

1 file changed

+136
-78
lines changed

1 file changed

+136
-78
lines changed
Lines changed: 136 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -1,92 +1,150 @@
1-
from requests_html import HTMLSession
2-
from bs4 import BeautifulSoup as bs
1+
import requests
2+
from bs4 import BeautifulSoup
33
import re
44
import json
5-
6-
# init session
7-
session = HTMLSession()
8-
5+
import argparse
96

107
def parse_video_info(html):
    """Parse video metadata out of a YouTube watch-page HTML document.

    Split out of get_video_info so the extraction logic can be exercised
    without a network round-trip.

    Args:
        html: Raw HTML (str) of a YouTube /watch page.

    Returns:
        dict with keys: title, views, date_published, description,
        duration, tags, likes, dislikes, and a nested 'channel' dict
        with name/url/subscribers.

    Raises:
        Exception: if ytInitialData cannot be located or lacks the
            expected structure.
    """
    result = {}

    # All modern metadata lives in the ytInitialData JSON blob embedded in a
    # <script> tag; the old itemprop meta tags no longer exist.
    data_match = re.search(r'var ytInitialData = ({.*?});', html)
    if not data_match:
        raise Exception("Could not find ytInitialData in page")
    data_json = json.loads(data_match.group(1))

    contents = data_json['contents']['twoColumnWatchNextResults']['results']['results']['contents']

    # Primary renderer: title / view count / publish date.
    if 'videoPrimaryInfoRenderer' in contents[0]:
        primary = contents[0]['videoPrimaryInfoRenderer']
        result["title"] = primary['title']['runs'][0]['text']
        result["views"] = primary['viewCount']['videoViewCountRenderer']['viewCount']['simpleText']
        result["date_published"] = primary['dateText']['simpleText']

    # Secondary renderer: channel details + description.
    # Guard the index: contents may have fewer than two entries.
    secondary = None
    if len(contents) > 1 and 'videoSecondaryInfoRenderer' in contents[1]:
        secondary = contents[1]['videoSecondaryInfoRenderer']
        owner = secondary['owner']['videoOwnerRenderer']
        channel_id = owner['navigationEndpoint']['browseEndpoint']['browseId']
        result['channel'] = {
            'name': owner['title']['runs'][0]['text'],
            # The /channel/ path prefix is required for a valid channel URL.
            'url': f"https://www.youtube.com/channel/{channel_id}",
            'subscribers': owner['subscriberCountText']['accessibility']['accessibilityData']['label'],
        }
    else:
        # Always populate 'channel' so callers that index it unconditionally
        # (e.g. the CLI printer) do not crash with KeyError.
        result['channel'] = {'name': 'Unknown', 'url': 'Unknown', 'subscribers': 'Unknown'}

    if secondary and 'attributedDescription' in secondary:
        result["description"] = secondary['attributedDescription']['content']
    else:
        result["description"] = "Description not available"

    # Duration is no longer rendered in the static DOM; fall back to the
    # approxDurationMs field of the embedded player response.
    duration_match = re.search(r'"approxDurationMs":"(\d+)"', html)
    if duration_match:
        total_seconds = int(duration_match.group(1)) // 1000
        minutes, seconds = divmod(total_seconds, 60)
        result["duration"] = f"{minutes}:{seconds:02d}"
    else:
        result["duration"] = "Duration not available"

    # Video tags (optional metadata).
    video_tags = data_json.get('metadata', {}).get('videoMetadataRenderer', {}).get('keywords', [])
    result["tags"] = ', '.join(video_tags) if video_tags else "No tags available"

    # Likes: scan every renderer and keep the last accessibility label
    # mentioning "like". Flattened with dict.get chains instead of the
    # original seven-level nested ifs. YouTube no longer publishes dislikes.
    result["likes"] = "Likes count not available"
    result["dislikes"] = "UNKNOWN"
    for content in contents:
        toggle = (content.get('compositeVideoPrimaryInfoRenderer', {})
                         .get('likeButton', {})
                         .get('toggleButtonRenderer', {}))
        label = (toggle.get('defaultText', {})
                       .get('accessibility', {})
                       .get('accessibilityData', {})
                       .get('label'))
        if label and 'like' in label.lower():
            result["likes"] = label

    return result


def get_video_info(url):
    """
    Extract video information from YouTube using modern approach.

    Args:
        url: Full URL of the YouTube video to scrape.

    Returns:
        dict of extracted metadata (see parse_video_info).

    Raises:
        Exception: wraps any fetch/parse failure with context; the
            underlying error is preserved as __cause__.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # timeout= prevents the script from hanging forever on a dead host.
        # NOTE: the previous version also built a BeautifulSoup tree here,
        # but never used it — all extraction works on the raw text.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        return parse_video_info(response.text)
    except Exception as e:
        # Keep the historical "plain Exception with message" contract, but
        # chain the cause so the real traceback is not lost.
        raise Exception(f"Error extracting video info: {str(e)}") from e
69122

70123
if __name__ == "__main__":
    # Command-line entry point: expects a single positional video URL.
    arg_parser = argparse.ArgumentParser(description="YouTube Video Data Extractor")
    arg_parser.add_argument("url", help="URL of the YouTube video")
    video_url = arg_parser.parse_args().url

    try:
        info = get_video_info(video_url)

        # Human-readable summary of everything we scraped.
        print(f"Title: {info['title']}")
        print(f"Views: {info['views']}")
        print(f"Published at: {info['date_published']}")
        print(f"Video Duration: {info['duration']}")
        print(f"Video tags: {info['tags']}")
        print(f"Likes: {info['likes']}")
        print(f"Dislikes: {info['dislikes']}")
        print(f"\nDescription: {info['description']}\n")
        print(f"\nChannel Name: {info['channel']['name']}")
        print(f"Channel URL: {info['channel']['url']}")
        print(f"Channel Subscribers: {info['channel']['subscribers']}")
    except Exception as err:
        print(f"Error: {err}")
        print("\nNote: YouTube frequently changes its structure, so this script may need updates.")

0 commit comments

Comments
 (0)