Last active
July 23, 2016 17:58
-
-
Save zengargoyle/5cdc234efeaf89fee3dd44a8ec1b5520 to your computer and use it in GitHub Desktop.
metafilter youtube video snarfer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Metafilter youtube link snarfer | |
Requires: | |
Perl - :) | |
Mojolicious - http://mojolicious.org/ - Your distro probably has a package | |
for this (in Debian it's libmojolicious-perl), otherwise it's easy to | |
install. | |
youtube-dl - https://rg3.github.io/youtube-dl/ - downloads the videos; its audio extraction (-x, used by fetcher.pl) also needs ffmpeg or avconv. | |
Usage: | |
$ ./meta-yt-snarf.pl http://www.metafilter.com/161149/In-Celebration-Of-The-80s-12-Remix | |
This will create a 'cached_urls' file with the youtube videos from the | |
requested url. You can run it again later to add newly posted videos. | |
Inspect the 'cached_urls' file and comment out the videos you don't want | |
to download by putting a '#' at the beginning of the line. | |
$ ./fetcher.pl | |
This will use 'youtube-dl' to fetch and extract the audio of all of the | |
un-commented-out videos in the 'cached_urls' file. It will create a | |
'fetched_urls' file containing the processed videos and won't process them | |
again the next time it's run. | |
Example Session: | |
$ rm -f cached_urls | |
$ rm -f fetched_urls | |
$ ./meta-yt-snarf.pl http://www.metafilter.com/161149/In-Celebration-Of-The-80s-12-Remix | |
$ wc -l cached_urls | |
129 cached_urls | |
$ $EDITOR cached_urls # comment out unwanted videos | |
$ ./fetcher.pl | |
[youtube] 88mTffVGAT4: Downloading webpage | |
[youtube] 88mTffVGAT4: Downloading video info webpage | |
[youtube] 88mTffVGAT4: Extracting video information | |
[download] Destination: King Crimson - Sleepless (Dance Mix)-88mTffVGAT4.webm | |
[download] 100% of 6.86MiB in 00:14 | |
[ffmpeg] Destination: King Crimson - Sleepless (Dance Mix)-88mTffVGAT4.opus | |
Deleting original file King Crimson - Sleepless (Dance Mix)-88mTffVGAT4.webm (pass -k to keep) | |
$ ls | |
cached_urls | |
Daryl Hall & John Oates - Maneater (Extended Club Mix) 1982-8sh5IxYy7DQ.ogg | |
fetched_urls | |
fetcher.pl | |
King Crimson - Sleepless (Dance Mix)-88mTffVGAT4.opus | |
meta-yt-snarf.pl | |
$ egrep -v '^#' cached_urls | |
8sh5IxYy7DQ Maneater (Special Extended Club Mix) | |
88mTffVGAT4 KING CRIMSON DANCE MIX | |
$ cat fetched_urls | |
8sh5IxYy7DQ | |
88mTffVGAT4 | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
use 5.016; use warnings; use autodie; | |
use utf8; | |
use open qw(:std :utf8); | |
# Input list of candidate videos, and the log of finished downloads.
my $cache_file   = 'cached_urls';
my $fetched_file = 'fetched_urls';

# "touch": opening for append creates the log when it does not exist yet,
# so the read below cannot die (autodie) on a first run.
{
    open my $touch, '>>', $fetched_file;
    close $touch;
}

#
# Things we've already fetched: every line of the log becomes a hash key.
#
my %fetched = do {
    open my $log, '<', $fetched_file;
    chomp(my @ids = <$log>);
    map { $_ => 1 } @ids;
};
#
# Video ids from the cache file that still need fetching.
#
# Skipped: lines commented out with '#', blank lines, ids already present
# in %fetched, and ids that appear more than once in the cache file.
#
my @to_fetch;
{
    my %queued;    # ids already pushed onto @to_fetch during this run

    open my $fh, '<', $cache_file;
    LINE:
    while (my $line = <$fh>) {
        # comment out things you don't want to fetch!
        next LINE if $line =~ /^\s*#/;

        # Only the first whitespace-separated field is the id; the rest of
        # the line is the human-readable title.
        my ($vid) = split ' ', $line;

        # A blank line yields no fields; without this guard an undef id
        # would be looked up, queued and handed to youtube-dl.
        next LINE if !defined $vid;

        next LINE if exists $fetched{$vid};
        next LINE if $queued{$vid}++;

        push @to_fetch, $vid;
    }
}
#
# Swiss army knife: run youtube-dl on every queued id and log each success
# to $fetched_file so it is not processed again on the next run.
#
{
    open my $done, '>>', $fetched_file;

    # Write each id out as soon as its download succeeds, so progress
    # survives if this script is killed part-way through a long run.
    # (Handle methods are loaded on demand on the perl this file requires.)
    $done->autoflush(1);

    VIDEO:
    for my $vid (@to_fetch) {
        # '--' ends option parsing: YouTube ids may begin with '-', which
        # youtube-dl would otherwise treat as a command-line switch.
        my $status = system 'youtube-dl', '-x', '--', $vid;

        if ($status == 0) {
            print {$done} "$vid\n";
            next VIDEO;
        }

        # -1 means the program could not be started at all; retrying with
        # the remaining ids would only fail the same way.
        die "cannot run youtube-dl: $!\n" if $status == -1;

        # perl ignores SIGINT/SIGQUIT while system() runs, so Ctrl-C only
        # kills the child.  Treat a signalled child as a request to stop.
        if (my $signal = $status & 127) {
            warn "youtube-dl was killed by signal $signal, stopping\n";
            last VIDEO;
        }

        # Ordinary failure: the id stays unlogged and is retried next run.
    }

    close $done;
}
exit;
__END__ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
use 5.016; use warnings; use autodie; | |
use utf8; | |
use open qw(:std :utf8); | |
use Mojo::UserAgent; | |
# File that accumulates one "<video id> <link text>" line per video found.
my $cache_file = 'cached_urls';

# Page to scan: first command-line argument, else the example thread.
my $source_url = shift(@ARGV)
    || 'http://www.metafilter.com/161149/In-Celebration-Of-The-80s-12-Remix';

# "touch": make sure the cache file exists before it is opened for reading.
{
    open my $touch, '>>', $cache_file;
    close $touch;
}
#
# Fetch the page and turn every YouTube-looking link into a pair
# [ $video_id, $link_text ].  Recognised forms are
#     youtube.com/...?v=ID    and    youtu.be/ID
# Any other link whose href mentions "youtu" becomes [ '#error', $message ];
# the leading '#' makes that entry a comment line in the cache file.
# YouTube links that carry no video id are dropped.
#
my $links = do {
    my $tx = Mojo::UserAgent->new->max_redirects(5)->get($source_url);

    # Don't mistake a failed request for a page that has no videos.
    if (my $err = $tx->error) {
        my $why = join ' ', grep { defined } @{$err}{qw(code message)};
        die "cannot fetch $source_url: $why\n";
    }

    $tx->res->dom->find('a[href]')
        ->grep(sub { $_[0]->{href} =~ /youtu/ })
        ->map(sub {
            my ($anchor) = @_;
            my $url  = Mojo::URL->new($anchor->{href});
            my $host = $url->host // '';    # relative links have no host

            # The cache file holds one record per line, so the title must
            # not contain line breaks; collapse all whitespace runs.
            my $title = $anchor->text // '';
            $title =~ s/\s+/ /g;
            $title =~ s/^ | $//g;

            if ($host =~ /\byoutube\.com\z/i) {
                my $id = $url->query->param('v');
                return unless defined $id && length $id;
                return [ $id, $title ];
            }
            elsif ($host =~ /\byoutu\.be\z/i) {
                # The id is the first path segment: /ID
                my ($id) = @{ $url->path->parts };
                return unless defined $id && length $id;
                return [ $id, $title ];
            }
            else {
                return [ '#error', "unknown url type: $url" ];
            }
        });
};
# say "@$_" for $links->each; | |
# exit; | |
#
# Remember every entry already in the cache file (commented out or not),
# then append only the links that are not in it yet.
#
# Videos are remembered by id.  '#error' diagnostics all share the same
# first field, so they are remembered by their full message instead;
# keyed by first field alone, the same diagnostic would be appended again
# on every run and only one diagnostic could be recorded per run.
#
my %cache;
open my $in, '<', $cache_file;
while (my $line = <$in>) {
    chomp $line;
    $line =~ s/^#\s*//;    # can comment out to prevent fetching
    my ($id, $title) = split ' ', $line, 2;
    next unless defined $id;    # blank line
    $title //= '';
    my $key = $id eq 'error' ? "error $title" : $id;
    $cache{$key} = $title;
}
close $in;

my @fresh;
for my $link ($links->each) {
    my ($id, $title) = @$link;

    # Same key scheme as the reader above, which sees '#error' lines with
    # the leading '#' already stripped.
    my $key = $id eq '#error' ? 'error ' . ($title // '') : $id;

    next if exists $cache{$key};
    push @fresh, $link;
    $cache{$key} = 1;    # avoid duplicates in fresh links
}

open my $out, '>>', $cache_file;
print {$out} "@$_\n" for @fresh;
close $out;
__END__ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment