"""Collect travel-related YouTube channels and their uploaded videos via the
YouTube Data API v3.

Outputs:
    travel_channels.csv  - channelId / channelTitle / description
    channel_videos.csv   - channelId / channelTitle / videoId / videoTitle / publishedAt
"""

import csv  # NOTE(review): unused here; kept in case another chunk of this file relies on it
import time

import pandas as pd
import requests

API_KEY = 'YOUR_API_KEY'  # Replace with your actual YouTube Data API key

# List of search keywords related to travel
SEARCH_KEYWORDS = [
    'travel vlog', 'travel guide', 'food travel', 'adventure travel',
    'backpacking', 'cultural travel experiences', 'travel tips',
    'travel destinations', 'travel blogger', 'world travel', 'wanderlust',
    'exploring the world', 'travel channel', 'travel documentary',
    'road trips'
]

# Base URL for YouTube Data API
BASE_URL = 'https://www.googleapis.com/youtube/v3'

# Seconds before an API request is abandoned. The original code passed no
# timeout, so a stalled connection could hang the script indefinitely.
REQUEST_TIMEOUT = 30


def _api_get(endpoint, params):
    """Issue one GET against ``BASE_URL/<endpoint>`` and return the parsed JSON.

    Returns ``None`` on any network, HTTP, or JSON-decoding failure; the error
    is printed rather than raised so one bad request does not abort the batch.
    """
    try:
        response = requests.get(
            f"{BASE_URL}/{endpoint}", params=params, timeout=REQUEST_TIMEOUT
        )
        # Surface quota/auth problems (e.g. 403) explicitly instead of
        # silently yielding a body with no 'items'.
        response.raise_for_status()
        return response.json()
    except (requests.RequestException, ValueError) as exc:
        print(f"Request to /{endpoint} failed: {exc}")
        return None


def collect_travel_channels(api_key, search_keywords):
    """Collects travel-related YouTube channels based on search keywords.

    De-duplicates channels that match several keywords and writes the result
    to ``travel_channels.csv``.
    """
    channels = []
    channel_ids_set = set()  # a channel may match more than one keyword
    for keyword in search_keywords:
        print(f"Searching for keyword: {keyword}")
        params = {
            'part': 'snippet',
            'q': keyword,
            'type': 'channel',
            'maxResults': 50,  # API maximum per page; only the first page is fetched
            'key': api_key
        }
        result = _api_get('search', params)
        if result is not None and 'items' in result:
            for item in result['items']:
                channel_id = item['snippet']['channelId']
                if channel_id not in channel_ids_set:
                    channel_ids_set.add(channel_id)
                    channels.append({
                        'channelId': channel_id,
                        'channelTitle': item['snippet']['channelTitle'],
                        'description': item['snippet']['description']
                    })
        else:
            print(f"Error in response: {result}")
        # Pause to respect API rate limits
        time.sleep(1)

    # Explicit columns guarantee a header row even when no channel was found,
    # so the downstream read_csv in collect_channel_videos still works.
    channels_df = pd.DataFrame(
        channels, columns=['channelId', 'channelTitle', 'description']
    )
    channels_df.to_csv('travel_channels.csv', index=False, encoding='utf-8')
    print("Finished collecting channel data.")


def collect_channel_videos(api_key):
    """Retrieves video lists for each channel.

    Reads ``travel_channels.csv`` (produced by :func:`collect_travel_channels`)
    and writes every video of every channel to ``channel_videos.csv``.
    """
    channels_df = pd.read_csv('travel_channels.csv', encoding='utf-8')
    videos = []
    for index, row in channels_df.iterrows():
        channel_id = row['channelId']
        channel_title = row['channelTitle']
        print(f"Processing channel: {channel_title} (ID: {channel_id})")
        # Get uploads playlist ID
        uploads_playlist_id = get_uploads_playlist_id(api_key, channel_id)
        if uploads_playlist_id:
            # Get videos from playlist
            channel_videos = get_videos_from_playlist(api_key, uploads_playlist_id)
            for video in channel_videos:
                videos.append({
                    'channelId': channel_id,
                    'channelTitle': channel_title,
                    'videoId': video['videoId'],
                    'videoTitle': video['videoTitle'],
                    'publishedAt': video['publishedAt']
                })
        else:
            print(f"Skipping channel {channel_title} due to missing uploads playlist.")
        # Pause between channels
        time.sleep(1)

    # Explicit columns keep the output well-formed even if no videos were found.
    videos_df = pd.DataFrame(
        videos,
        columns=['channelId', 'channelTitle', 'videoId', 'videoTitle', 'publishedAt']
    )
    videos_df.to_csv('channel_videos.csv', index=False, encoding='utf-8')
    print("Finished collecting video data.")


def get_uploads_playlist_id(api_key, channel_id):
    """Retrieves the uploads playlist ID for a given channel.

    Returns ``None`` when the channel cannot be resolved (bad ID, API error).
    """
    params = {
        'part': 'contentDetails',
        'id': channel_id,
        'key': api_key
    }
    result = _api_get('channels', params)
    if result is not None and result.get('items'):
        return result['items'][0]['contentDetails']['relatedPlaylists']['uploads']
    print(f"Could not get uploads playlist for channel ID: {channel_id}")
    return None


def get_videos_from_playlist(api_key, playlist_id):
    """Retrieves all videos from a playlist, following nextPageToken pagination."""
    videos = []
    params = {
        'part': 'snippet,contentDetails',
        'playlistId': playlist_id,
        'maxResults': 50,
        'key': api_key
    }
    while True:
        result = _api_get('playlistItems', params)
        if result is None or 'items' not in result:
            print(f"Error retrieving videos: {result}")
            break
        for item in result['items']:
            # 'videoPublishedAt' is omitted for private/unavailable videos;
            # fall back to the snippet's timestamp instead of raising KeyError.
            published_at = item['contentDetails'].get(
                'videoPublishedAt', item['snippet'].get('publishedAt', '')
            )
            videos.append({
                'videoId': item['contentDetails']['videoId'],
                'videoTitle': item['snippet']['title'],
                'publishedAt': published_at
            })
        if 'nextPageToken' in result:
            params['pageToken'] = result['nextPageToken']
            time.sleep(0.5)  # be gentle between pages
        else:
            break
    return videos


def main():
    """Run the two collection phases in order: channels first, then videos."""
    collect_travel_channels(API_KEY, SEARCH_KEYWORDS)
    collect_channel_videos(API_KEY)
    print("Data collection complete.")


if __name__ == '__main__':
    main()

For further actions, you may consider blocking this person and/or reporting abuse
Top comments (0)