Skip to content

Instantly share code, notes, and snippets.

@miraris
Last active May 4, 2018 15:22
Show Gist options
  • Save miraris/9492e14a2a7623f1f0cbdd31e78636e6 to your computer and use it in GitHub Desktop.
Save miraris/9492e14a2a7623f1f0cbdd31e78636e6 to your computer and use it in GitHub Desktop.
first perl script pls no flame
#!/usr/bin/perl -w
use Data::Dumper;
use DBI;
use XML::Simple;
use Regexp::Common qw(URI);
use DateTime;
use Time::Piece;
use DateTime::Format::Pg;
use feature qw(say);
use Parse::BBCode;
use open qw/:std :utf8/;
# pgsql connection
my $dsn = "DBI:Pg:dbname = noraneko2;host = 127.0.0.1;port = 5432";
my $username = "postgres";
my $password = "";
my $dbh = DBI->connect( $dsn, $username, $password, { RaiseError => 1 } )
or die $DBI::errstr;
# Hash for anime types
%types = (
'TV Series' => 0,
'OVA' => 1,
'Movie' => 2,
'TV Special' => 3,
'Web' => 4,
'Music Video' => 5,
'unknown' => 6,
'Other' => 7
);
# Hash for title types
%title_types = ( 'main' => 0, 'synonym' => 1, 'official' => 2, 'short' => 3 );
# language codes list
# https://anidb.net/perl-bin/animedb.pl?show=dump&t=lang
%languages = (
'x-unk' => 1,
'ja' => 2,
'zh' => 3,
'en' => 4,
'de' => 5,
'fr' => 6,
'it' => 7,
'es' => 8,
'ru' => 9,
'ko' => 10,
'pl' => 11,
'sv' => 12,
'ar' => 13,
'pt' => 14,
'no' => 15,
'pt-BR' => 16,
'ca' => 17,
'cs' => 18,
'da' => 19,
'nl' => 20,
'fi' => 21,
'el' => 22,
'he' => 23,
'hu' => 24,
'x-in' => 25,
'lt' => 26,
'th' => 27,
'tr' => 28,
'zh-x-cmn' => 29,
'zh-x-yue' => 30,
'zh-x-nan' => 31,
'x-other' => 32,
'ro' => 33,
'my' => 34,
'sk' => 35,
'sl' => 36,
'tt' => 37,
'lv' => 38,
'ka' => 39,
'la' => 40,
'vi' => 41,
'id' => 42,
'ta' => 43,
'bg' => 44,
'x-jat' => 45,
'et' => 46,
'uk' => 47,
'hr' => 48,
'sr' => 49,
'zh-Hant' => 50,
'zh-Hans' => 51,
'is' => 52,
'al' => 53,
'bd' => 54,
'x-kot' => 55,
'es-LA' => 56,
'gl' => 57,
'jv' => 58,
'fa' => 59,
'tl' => 60,
'ht' => 61,
'mn' => 62,
'x-zht' => 63,
'hi' => 64,
'si' => 65,
'eo' => 66,
'ur' => 67,
'eu' => 68,
'af' => 69,
'grc' => 70,
'ne' => 71,
'bs' => 72,
'tl' => 73,
'x-tht' => 74
);
# Insert anime titles
sub insertTitles {
my $timestamp = Time::Piece->new->strftime('%Y-%m-%d %H:%M:%S');
my ( $titles, $anime_id ) = @_;
if ( ref( $titles->{title} ) eq 'ARRAY' ) {
for ( @{ $titles->{title} } ) {
my $sql =
'INSERT INTO anime_titles (title, anime_id, language_id, title_type_id, created_at) VALUES (?, ?, ?, ?, ?)';
my $sth = $dbh->prepare($sql);
$sth->execute(
$_->{content}, $anime_id,
$languages{ $_->{'xml:lang'} },
$title_types{ $_->{type} }, $timestamp
);
}
}
else {
my $sql =
'INSERT INTO anime_titles (title, anime_id, language_id, title_type_id, created_at) VALUES (?, ?, ?, ?, ?)';
my $sth = $dbh->prepare($sql);
$sth->execute(
$titles->{title}->{content},
$anime_id,
$languages{ $titles->{title}->{'xml:lang'} },
$title_types{ $titles->{title}->{type} },
$timestamp
);
}
}
# Insert episode titles
sub insertEpisodeTitle {
my $timestamp = Time::Piece->new->strftime('%Y-%m-%d %H:%M:%S');
my ( $title, $episode_id ) = @_;
my $sql =
'INSERT INTO episode_titles (title, episode_id, language_id, created_at) VALUES (?, ?, ?, ?)';
my $sth = $dbh->prepare($sql);
$sth->execute( $title->{content}, $episode_id,
$languages{ $title->{'xml:lang'} }, $timestamp );
}
# Separate function that inserts the episode shit
sub episodeQuery {
my ( $episode, $anime_id ) = @_;
$titles = $episode->{title};
my $sql =
'INSERT INTO episodes (anime_id, air_date, episode_number, episode_type_id, duration, created_at) VALUES (?, ?, ?, ?, ?, ?)';
my $sth = $dbh->prepare($sql);
$sth->execute(
$anime_id, $episode->{airdate},
$episode->{epno}->{content},
$episode->{epno}->{type},
$episode->{length}, $timestamp
) or die "died, current anime ID: $anime_id";
#ep id
my $episode_id = $dbh->last_insert_id( undef, undef, undef, undef,
{ sequence => 'episodes_id_seq' } );
# Need to check whether it's an array
if ( ref($titles) eq 'ARRAY' ) {
for my $title ( @{$titles} ) {
insertEpisodeTitle( $title, $episode_id );
}
}
else {
insertEpisodeTitle( $titles, $episode_id );
}
}
# Insert episodes
sub insertEpisodes {
my $timestamp = Time::Piece->new->strftime('%Y-%m-%d %H:%M:%S');
my ( $episodes, $anime_id ) = @_;
# the episode is one object only..
if ( $episodes->{episode}{epno} ) {
episodeQuery( $episodes->{episode}, $anime_id );
}
else {
for my $key ( keys %{ $episodes->{episode} } ) {
episodeQuery( $episodes->{episode}{$key}, $anime_id );
}
}
}
# Check whether it's a legit date
sub parseDate {
my ($date) = @_;
return ( $date =~ tr/-// == 2 ) ? $date : undef;
}
# Get the anime state
sub getState {
my ( $start_date, $end_date ) = @_;
my $now = DateTime->now->ymd;
if ( defined $start_date && $start_date lt $now ) {
return ( $end_date && $end_date lt $now ) ? 0 : 1;
}
# some weird cases ..
# when $start_date is null
if ( $end_date lt $now ) {
return 0;
}
return 2;
}
# Map anime_id <=> anidb_id
sub mapAnime {
my ( $anime_id, $anidb_id ) = @_;
my $sql = 'INSERT INTO anime_map (anime_id, anidb_id) VALUES(?, ?)';
my $sth = $dbh->prepare($sql);
$sth->execute( $anime_id, $anidb_id ) or die $DBI::errstr;
}
# Insert the anime
sub insertAnime {
my ($anime) = @_;
my $anidb_id = $anime->{id};
say("AniDB ID: $anidb_id");
# skip a test anime
if ( $anidb_id == 357 ) {
say("skipping test anime");
next;
}
# Insert the anime details here..
$synopsis = $anime->{description};
# convert their BBCode links to valid ones
# cancer but it works?
my @bad_words;
while ( $synopsis =~ /\[(.*?)\]/g ) {
my $match = $1;
unless ( $synopsis =~ /\Q\/$match/ || $match =~ '/' ) {
push( @bad_words, $match );
}
}
$synopsis =~ s{$RE{URI}{-keep}}{[url=$1]}g;
foreach my $badword (@bad_words) {
my $newString = $badword . '[/url]';
$synopsis =~ s{\s+\[$badword\]}{$newString}g;
}
$start_date = parseDate( $anime->{startdate} );
$end_date = parseDate( $anime->{enddate} );
$status = getState( $start_date, $end_date );
my $timestamp = Time::Piece->new->strftime('%Y-%m-%d %H:%M:%S');
my $sql =
'INSERT INTO anime (synopsis, start_date, end_date, state_id, type_id, episode_count, created_at)
VALUES(?, ?, ?, ?, ?, ?, ?)';
my $sth = $dbh->prepare($sql);
$sth->execute(
$synopsis, $start_date, $end_date, $status,
$types{ $anime->{type} },
$anime->{episodecount}, $timestamp
) or die $DBI::errstr;
my $anime_id = $dbh->last_insert_id( undef, undef, undef, undef,
{ sequence => 'anime_id_seq' } );
insertTitles( $anime->{titles}, $anime_id );
insertEpisodes( $anime->{episodes}, $anime_id );
mapAnime( $anime_id, $anidb_id );
}
# Simple XML instance
my $simple = XML::Simple->new();
# Loop over all of the XML files
@files = </home/miraris/Documents/anidb-dump/data/*.xml>;
# Schwartzian transform (https://en.wikipedia.org/wiki/Schwartzian_transform)
# Sorts the file names list numerically
my @sorted = map { $_->[0] }
sort { $a->[1] <=> $b->[1] }
map { [ $_, $_ =~ /(\d+)/ ] } @files;
foreach $file (@sorted) {
my $tree = $simple->XMLin($file);
insertAnime($tree);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment