Skip to content

Instantly share code, notes, and snippets.

@sazio
Created August 28, 2020 14:20
Show Gist options
  • Save sazio/e4afeb14469491c9148bc349c701675f to your computer and use it in GitHub Desktop.
Save sazio/e4afeb14469491c9148bc349c701675f to your computer and use it in GitHub Desktop.
# Extract some meta-data
#
# Builds `meta_pd`, a DataFrame with one row per ``*.mp4`` file found in
# VIDEOS_FOLDER_TRAIN, holding container, video-stream and audio-stream fields
# taken from the dict returned by `ffprobe(filepath)`.
# When EXTRACT_META is set the table is recomputed and cached as
# ``HOME + "videos_meta.pkl"``; otherwise that cached pickle is loaded.

# Per-stream keys, in the order their columns appear in the DataFrame.
_VIDEO_KEYS = ("codec_name", "height", "width", "nb_frames", "bit_rate",
               "duration", "start_time", "avg_frame_rate")
_AUDIO_KEYS = ("codec_name", "channels", "sample_rate", "nb_frames",
               "bit_rate", "duration", "start_time")

_META_COLUMNS = ["filename", "format", "video_codec_name", "video_height", "video_width",
                 "video_nb_frames", "video_bit_rate", "video_duration", "video_start_time", "video_fps",
                 "audio_codec_name", "audio_channels", "audio_sample_rate", "audio_nb_frames",
                 "audio_bit_rate", "audio_duration", "audio_start_time"]

# Columns that arrive as strings (or None) and are stored as float32.
_FLOAT_COLUMNS = ["video_duration", "video_bit_rate", "video_start_time", "video_nb_frames",
                  "audio_sample_rate", "audio_nb_frames", "audio_bit_rate",
                  "audio_duration", "audio_start_time"]


def _stream_at(streams, index):
    """Return the stream dict at *index*, or an empty dict if there is none."""
    return streams[index] if index < len(streams) else {}


def _parse_frame_rate(rate):
    """Convert a ``"num/den"`` rate string to a float.

    Returns None when *rate* is not a string, is not of the form ``a/b``,
    or has a zero denominator (which would otherwise raise
    ZeroDivisionError).
    """
    if not isinstance(rate, str):
        return None
    parts = rate.split("/")
    if len(parts) != 2:
        return None
    denominator = float(parts[1])
    if denominator == 0:
        return None
    return float(parts[0]) / denominator


def _meta_row(js, filepath, subfolder):
    """Flatten one probe result into a tuple ordered like `_META_COLUMNS`.

    Missing sections or fields yield None instead of raising, so a file
    with no second stream still produces a row.
    """
    fmt = js.get("format") or {}
    streams = js.get("streams") or []
    # NOTE(review): keeps the original positional assumption that stream 0
    # is the video stream and stream 1 the audio stream - confirm for inputs
    # where the order may differ.
    video = _stream_at(streams, 0)
    audio = _stream_at(streams, 1)
    # Fall back to the path we probed when the probe result has no filename;
    # either way the leading "<subfolder>/" prefix is stripped.
    filename = fmt.get("filename") or filepath
    head = (filename[len(subfolder) + 1:], fmt.get("format_long_name"))
    return (head
            + tuple(video.get(key) for key in _VIDEO_KEYS)
            + tuple(audio.get(key) for key in _AUDIO_KEYS))


if EXTRACT_META:
    subfolder = VIDEOS_FOLDER_TRAIN
    filepaths = glob.glob(subfolder + "/*.mp4")
    results = []
    for filepath in tqdm(filepaths):
        js = ffprobe(filepath)
        # Files for which the probe returned nothing are skipped.
        if js:
            results.append(_meta_row(js, filepath, subfolder))

    meta_pd = pd.DataFrame(results, columns=_META_COLUMNS)
    meta_pd["video_fps"] = meta_pd["video_fps"].apply(_parse_frame_rate)
    for column in _FLOAT_COLUMNS:
        meta_pd[column] = meta_pd[column].astype(np.float32)
    meta_pd.to_pickle(HOME + "videos_meta.pkl")
else:
    # The pickle is a cache written by the branch above; only load files
    # this script produced itself.
    meta_pd = pd.read_pickle(HOME + "videos_meta.pkl")
meta_pd.head()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment