# ------------------------------------------------------------------------------
# Copyright (c) ETRI. All rights reserved.
# Licensed under the BSD 3-Clause License.
# This file is part of Youtube-Gesture-Dataset, a sub-project of AIR(AI for Robots) project.
# You can refer to details of AIR project at https://aiforrobots.github.io
# Written by Youngwoo Yoon (youngwoo@etri.re.kr)
# ------------------------------------------------------------------------------

import glob
import os
import pickle
import re
import unicodedata

from tqdm import tqdm_gui

from data_utils import *  # provides my_config, SubtitleWrapper, WebVTT, load_clip_data, read_video, get_skeleton_from_frame


def read_subtitle(vid):
    postfix_in_filename = '-en.vtt'
    file_list = glob.glob(my_config.SUBTITLE_PATH + '/*' + vid + postfix_in_filename)
    if len(file_list) > 1:
        print('more than one subtitle. check this.', file_list)
        assert False
    if len(file_list) == 1:
        return WebVTT().read(file_list[0])
    else:
        return []


# turn a Unicode string into plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427
def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )


# lowercase, trim, and remove non-letter characters
def normalize_string(s):
    s = unicode_to_ascii(s.lower().strip())
    s = re.sub(r"([,.!?])", r" \1 ", s)  # isolate punctuation marks
    s = re.sub(r"'", "", s)  # remove apostrophes
    s = re.sub(r"[^a-zA-Z,.!?]+", r" ", s)  # replace other characters with whitespace
    s = re.sub(r"\s+", r" ", s).strip()
    return s

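# example behavior of normalize_string (for reference):
#   normalize_string("Hello, World! It's fine.") -> 'hello , world ! its fine .'
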
def normalize_subtitle(vtt_subtitle):
    for sub in vtt_subtitle:
        sub.text = normalize_string(sub.text)
    return vtt_subtitle


def make_ted_gesture_dataset():
    dataset_train = []
    dataset_val = []
    dataset_test = []
    n_saved_clips = [0, 0, 0]

    video_files = sorted(glob.glob(my_config.VIDEO_PATH + "/*.mp4"), key=os.path.getmtime)
    for video_file in tqdm_gui(video_files):
        vid = os.path.split(video_file)[1][-15:-4]  # 11-character YouTube video id taken from the file name
        print(vid)

        # load clip, video, and subtitle
        clip_data = load_clip_data(vid)
        if clip_data is None:
            print('[ERROR] clip data file does not exist!')
            break

        video_wrapper = read_video(my_config.VIDEO_PATH, vid)

        subtitle_type = my_config.SUBTITLE_TYPE
        subtitle = SubtitleWrapper(vid, subtitle_type).get()

        if subtitle is None:
            print('[WARNING] subtitle does not exist! skipping this video.')
            continue

        dataset_train.append({'vid': vid, 'clips': []})
        dataset_val.append({'vid': vid, 'clips': []})
        dataset_test.append({'vid': vid, 'clips': []})

        word_index = 0
        valid_clip_count = 0
        for clip in clip_data:
            start_frame_no, end_frame_no, clip_pose_all = clip['clip_info'][0], clip['clip_info'][1], clip['frames']
            clip_word_list = []

            # skip FALSE clips
            if not clip['clip_info'][2]:
                continue

            # train/val/test split (roughly 8:1:1 over the valid clips of each video)
            if valid_clip_count % 10 == 9:
                dataset = dataset_test
                dataset_idx = 2
            elif valid_clip_count % 10 == 8:
                dataset = dataset_val
                dataset_idx = 1
            else:
                dataset = dataset_train
                dataset_idx = 0
            valid_clip_count += 1

            # get the subtitle words that fit this clip; scanning resumes near where the previous clip stopped
            for ib in range(word_index - 1, len(subtitle)):
                if ib < 0:
                    continue

                word_s = video_wrapper.second2frame(subtitle[ib]['start'])
                word_e = video_wrapper.second2frame(subtitle[ib]['end'])
                word = subtitle[ib]['word']

                if word_s >= end_frame_no:
                    word_index = ib
                    break

                if word_e <= start_frame_no:
                    continue

                word = normalize_string(word)
                clip_word_list.append([word, word_s, word_e])

            if clip_word_list:
                clip_skeleton = []

                # get skeletons of the upper body in the clip
                # (first 24 values per frame; presumably 8 upper-body joints x (x, y, confidence))
                for frame in clip_pose_all:
                    if frame:
                        clip_skeleton.append(get_skeleton_from_frame(frame)[:24])
                    else:  # frame with no skeleton
                        clip_skeleton.append([0] * 24)

                # proceed if skeleton list is not empty
                if clip_skeleton:
                    # save subtitles and skeletons corresponding to clips
                    n_saved_clips[dataset_idx] += 1
                    dataset[-1]['clips'].append({'words': clip_word_list,
                                                 'skeletons': clip_skeleton,
                                                 'start_frame_no': start_frame_no, 'end_frame_no': end_frame_no,
                                                 'vid': vid
                                                 })
                    print('{} ({}, {})'.format(vid, start_frame_no, end_frame_no))
                else:
                    print('{} ({}, {}) - consecutive missing frames'.format(vid, start_frame_no, end_frame_no))

        # for debugging
        # if vid == 'yq3TQoMjXTw':
        #     break

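    # structure of each saved list (one entry per video, derived from the append call above):
    #   {'vid': <video id>,
    #    'clips': [{'words': [[word, start_frame, end_frame], ...],
    #               'skeletons': [<24 values per frame>, ...],
    #               'start_frame_no': ..., 'end_frame_no': ..., 'vid': <video id>}, ...]}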
    print('writing to pickle...')
    with open('ted_gesture_dataset_train.pickle', 'wb') as f:
        pickle.dump(dataset_train, f)
    with open('ted_gesture_dataset_train_small.pickle', 'wb') as f:  # for debugging
        pickle.dump(dataset_train[0:10], f)
    with open('ted_gesture_dataset_val.pickle', 'wb') as f:
        pickle.dump(dataset_val, f)
    with open('ted_gesture_dataset_test.pickle', 'wb') as f:
        pickle.dump(dataset_test, f)

    print('no. of saved clips: train {}, val {}, test {}'.format(n_saved_clips[0], n_saved_clips[1], n_saved_clips[2]))


if __name__ == '__main__':
    make_ted_gesture_dataset()
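
# minimal usage sketch for the saved files (assumes the script above has been run
# and the pickle files exist in the working directory):
#   with open('ted_gesture_dataset_train.pickle', 'rb') as f:
#       train = pickle.load(f)
#   print(len(train), train[0]['vid'], len(train[0]['clips']))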