Metafilter youtube link snarfer
Requires:
Perl - :)
Mojolicious - http://mojolicious.org/ - Your distro probably has a package
for this (in Debian it's libmojolicious-perl), otherwise it's easy to
install.
youtube-dl - https://rg3.github.io/youtube-dl/ - need I say more?
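For example, on Debian/Ubuntu something like this should cover both
(package names may differ on other distros; Mojolicious can also be
installed from CPAN):
$ sudo apt-get install libmojolicious-perl youtube-dl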
Usage:
$ ./meta-yt-snarf.pl http://www.metafilter.com/161149/In-Celebration-Of-The-80s-12-Remix
This will create a 'cached_urls' file listing the YouTube videos linked from
the requested URL. You can run it again later to add newly posted videos.
Inspect the 'cached_urls' file and comment out the videos you don't want
to download by putting a '#' at the beginning of the line.
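Each line in 'cached_urls' is a video id followed by the link text (the ids
below are taken from the example session further down); a '#' at the start,
as on the second line here, makes fetcher.pl skip that video:
8sh5IxYy7DQ Maneater (Special Extended Club Mix)
# 88mTffVGAT4 KING CRIMSON DANCE MIX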
$ ./fetcher.pl
This will use 'youtube-dl' to fetch every un-commented-out video in the
'cached_urls' file and extract its audio. It records each processed video id
in a 'fetched_urls' file so the same videos won't be processed again the
next time it's run.
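If you are happy to fetch everything that gets posted, the two scripts can
also be chained in one pass, e.g.:
$ ./meta-yt-snarf.pl http://www.metafilter.com/161149/In-Celebration-Of-The-80s-12-Remix && ./fetcher.pl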
Example Session:
$ rm -f cached_urls
$ rm -f fetched_urls
$ ./meta-yt-snarf.pl http://www.metafilter.com/161149/In-Celebration-Of-The-80s-12-Remix
$ wc -l cached_urls
129 cached_urls
$ $EDITOR cached_urls # comment out unwanted videos
$ ./fetcher.pl
[youtube] 88mTffVGAT4: Downloading webpage
[youtube] 88mTffVGAT4: Downloading video info webpage
[youtube] 88mTffVGAT4: Extracting video information
[download] Destination: King Crimson - Sleepless (Dance Mix)-88mTffVGAT4.webm
[download] 100% of 6.86MiB in 00:14
[ffmpeg] Destination: King Crimson - Sleepless (Dance Mix)-88mTffVGAT4.opus
Deleting original file King Crimson - Sleepless (Dance Mix)-88mTffVGAT4.webm (pass -k to keep)
$ ls
cached_urls
Daryl Hall & John Oates - Maneater (Extended Club Mix) 1982-8sh5IxYy7DQ.ogg
fetched_urls
fetcher.pl
King Crimson - Sleepless (Dance Mix)-88mTffVGAT4.opus
meta-yt-snarf.pl
$ egrep -v '^#' cached_urls
8sh5IxYy7DQ Maneater (Special Extended Club Mix)
88mTffVGAT4 KING CRIMSON DANCE MIX
$ cat fetched_urls
8sh5IxYy7DQ
88mTffVGAT4
#!/usr/bin/env perl
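# fetcher.pl - download the un-commented-out videos listed in 'cached_urls'
# with youtube-dl, extract their audio, and record successes in 'fetched_urls'.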
use 5.016; use warnings; use autodie;
use utf8;
use open qw(:std :utf8);
my $cache_file = 'cached_urls';
my $fetched_file = 'fetched_urls';
{ open my $fh, '>>', $fetched_file; close $fh } # touch
#
# Things we've already fetched
#
my %fetched;
{
  open my $fh, '<', $fetched_file;
  while (<$fh>) {
    chomp;
    $fetched{$_} = 1;
  }
}
#
# Non-commented-out things from the cache file we want to fetch
#
my @to_fetch;
{
  open my $fh, '<', $cache_file;
  while (<$fh>) {
    next if /^#/;    # comment out things you don't want to fetch!
    my ($vid) = split;
    next if exists $fetched{$vid};
    push @to_fetch, $vid;
  }
}
#
# Swiss army knife: run youtube-dl on everything left to fetch
#
{
  open my $done, '>>', $fetched_file;
  for (@to_fetch) {
    # system() returns 0 on success, so the 'or' branch records the id
    # in the fetched file only when youtube-dl succeeded
    system 'youtube-dl', '-x', $_ or print {$done} "$_\n";
  }
  close $done;
}
exit;
__END__
#!/usr/bin/env perl
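# meta-yt-snarf.pl - collect the YouTube links from a MetaFilter thread and
# append any new ones to 'cached_urls'.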
use 5.016; use warnings; use autodie;
use utf8;
use open qw(:std :utf8);
use Mojo::UserAgent;
my $cache_file = 'cached_urls';
my $source_url = shift || 'http://www.metafilter.com/161149/In-Celebration-Of-The-80s-12-Remix';
{ open my $fh, '>>', $cache_file; close $fh } # touch
my $links = Mojo::UserAgent->new->max_redirects(5)
  ->get($source_url)->res->dom->find('a[href]')
  ->grep(sub { $_[0]->{href} =~ /youtu/ })
  ->map( sub {
    my $url = Mojo::URL->new($_[0]->{href});
    if ($url->host =~ /\byoutube\.com$/) {
      return unless $url->query->param('v');
      [ $url->query->param('v'), $_[0]->text ];
    }
    elsif ($url->host =~ /youtu\.be$/) {
      [ substr($url->path, 1), $_[0]->text ];
    }
    else {
      [ '#error', "unknown url type: $url" ]
    }
  } );
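# $links is now a Mojo::Collection of [ video_id, link_text ] pairs;
# unrecognized links become [ '#error', ... ], which end up as commented-out
# lines in the cache file and are therefore skipped by fetcher.pl.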
# say "@$_" for $links->each;
# exit;
my %cache;
open my $in, '<', $cache_file;
while (<$in>) {
  s/^#\s*//;    # strip any leading '#' so commented-out entries still count as cached
  my ($url, $title) = split ' ', $_, 2;
  $cache{$url} = $title;
}
close $in;
my @fresh;
for ($links->each) {
  next if exists $cache{$_->[0]};
  push @fresh, $_;
  $cache{$_->[0]} = 1;    # avoid duplicates in fresh links
}
open my $out, '>>', $cache_file;
print {$out} "@$_\n" for @fresh;
close $out;
__END__