Last active
May 4, 2018 15:22
-
-
Save miraris/9492e14a2a7623f1f0cbdd31e78636e6 to your computer and use it in GitHub Desktop.
first perl script pls no flame
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl -w | |
use Data::Dumper; | |
use DBI; | |
use XML::Simple; | |
use Regexp::Common qw(URI); | |
use DateTime; | |
use Time::Piece; | |
use DateTime::Format::Pg; | |
use feature qw(say); | |
use Parse::BBCode; | |
use open qw/:std :utf8/; | |
# pgsql connection | |
my $dsn = "DBI:Pg:dbname = noraneko2;host = 127.0.0.1;port = 5432"; | |
my $username = "postgres"; | |
my $password = ""; | |
my $dbh = DBI->connect( $dsn, $username, $password, { RaiseError => 1 } ) | |
or die $DBI::errstr; | |
# Hash for anime types | |
%types = ( | |
'TV Series' => 0, | |
'OVA' => 1, | |
'Movie' => 2, | |
'TV Special' => 3, | |
'Web' => 4, | |
'Music Video' => 5, | |
'unknown' => 6, | |
'Other' => 7 | |
); | |
# Hash for title types | |
%title_types = ( 'main' => 0, 'synonym' => 1, 'official' => 2, 'short' => 3 ); | |
# language codes list | |
# https://anidb.net/perl-bin/animedb.pl?show=dump&t=lang | |
%languages = ( | |
'x-unk' => 1, | |
'ja' => 2, | |
'zh' => 3, | |
'en' => 4, | |
'de' => 5, | |
'fr' => 6, | |
'it' => 7, | |
'es' => 8, | |
'ru' => 9, | |
'ko' => 10, | |
'pl' => 11, | |
'sv' => 12, | |
'ar' => 13, | |
'pt' => 14, | |
'no' => 15, | |
'pt-BR' => 16, | |
'ca' => 17, | |
'cs' => 18, | |
'da' => 19, | |
'nl' => 20, | |
'fi' => 21, | |
'el' => 22, | |
'he' => 23, | |
'hu' => 24, | |
'x-in' => 25, | |
'lt' => 26, | |
'th' => 27, | |
'tr' => 28, | |
'zh-x-cmn' => 29, | |
'zh-x-yue' => 30, | |
'zh-x-nan' => 31, | |
'x-other' => 32, | |
'ro' => 33, | |
'my' => 34, | |
'sk' => 35, | |
'sl' => 36, | |
'tt' => 37, | |
'lv' => 38, | |
'ka' => 39, | |
'la' => 40, | |
'vi' => 41, | |
'id' => 42, | |
'ta' => 43, | |
'bg' => 44, | |
'x-jat' => 45, | |
'et' => 46, | |
'uk' => 47, | |
'hr' => 48, | |
'sr' => 49, | |
'zh-Hant' => 50, | |
'zh-Hans' => 51, | |
'is' => 52, | |
'al' => 53, | |
'bd' => 54, | |
'x-kot' => 55, | |
'es-LA' => 56, | |
'gl' => 57, | |
'jv' => 58, | |
'fa' => 59, | |
'tl' => 60, | |
'ht' => 61, | |
'mn' => 62, | |
'x-zht' => 63, | |
'hi' => 64, | |
'si' => 65, | |
'eo' => 66, | |
'ur' => 67, | |
'eu' => 68, | |
'af' => 69, | |
'grc' => 70, | |
'ne' => 71, | |
'bs' => 72, | |
'tl' => 73, | |
'x-tht' => 74 | |
); | |
# Insert anime titles | |
sub insertTitles { | |
my $timestamp = Time::Piece->new->strftime('%Y-%m-%d %H:%M:%S'); | |
my ( $titles, $anime_id ) = @_; | |
if ( ref( $titles->{title} ) eq 'ARRAY' ) { | |
for ( @{ $titles->{title} } ) { | |
my $sql = | |
'INSERT INTO anime_titles (title, anime_id, language_id, title_type_id, created_at) VALUES (?, ?, ?, ?, ?)'; | |
my $sth = $dbh->prepare($sql); | |
$sth->execute( | |
$_->{content}, $anime_id, | |
$languages{ $_->{'xml:lang'} }, | |
$title_types{ $_->{type} }, $timestamp | |
); | |
} | |
} | |
else { | |
my $sql = | |
'INSERT INTO anime_titles (title, anime_id, language_id, title_type_id, created_at) VALUES (?, ?, ?, ?, ?)'; | |
my $sth = $dbh->prepare($sql); | |
$sth->execute( | |
$titles->{title}->{content}, | |
$anime_id, | |
$languages{ $titles->{title}->{'xml:lang'} }, | |
$title_types{ $titles->{title}->{type} }, | |
$timestamp | |
); | |
} | |
} | |
# Insert episode titles | |
sub insertEpisodeTitle { | |
my $timestamp = Time::Piece->new->strftime('%Y-%m-%d %H:%M:%S'); | |
my ( $title, $episode_id ) = @_; | |
my $sql = | |
'INSERT INTO episode_titles (title, episode_id, language_id, created_at) VALUES (?, ?, ?, ?)'; | |
my $sth = $dbh->prepare($sql); | |
$sth->execute( $title->{content}, $episode_id, | |
$languages{ $title->{'xml:lang'} }, $timestamp ); | |
} | |
# Separate function that inserts the episode shit | |
sub episodeQuery { | |
my ( $episode, $anime_id ) = @_; | |
$titles = $episode->{title}; | |
my $sql = | |
'INSERT INTO episodes (anime_id, air_date, episode_number, episode_type_id, duration, created_at) VALUES (?, ?, ?, ?, ?, ?)'; | |
my $sth = $dbh->prepare($sql); | |
$sth->execute( | |
$anime_id, $episode->{airdate}, | |
$episode->{epno}->{content}, | |
$episode->{epno}->{type}, | |
$episode->{length}, $timestamp | |
) or die "died, current anime ID: $anime_id"; | |
#ep id | |
my $episode_id = $dbh->last_insert_id( undef, undef, undef, undef, | |
{ sequence => 'episodes_id_seq' } ); | |
# Need to check whether it's an array | |
if ( ref($titles) eq 'ARRAY' ) { | |
for my $title ( @{$titles} ) { | |
insertEpisodeTitle( $title, $episode_id ); | |
} | |
} | |
else { | |
insertEpisodeTitle( $titles, $episode_id ); | |
} | |
} | |
# Insert episodes | |
sub insertEpisodes { | |
my $timestamp = Time::Piece->new->strftime('%Y-%m-%d %H:%M:%S'); | |
my ( $episodes, $anime_id ) = @_; | |
# the episode is one object only.. | |
if ( $episodes->{episode}{epno} ) { | |
episodeQuery( $episodes->{episode}, $anime_id ); | |
} | |
else { | |
for my $key ( keys %{ $episodes->{episode} } ) { | |
episodeQuery( $episodes->{episode}{$key}, $anime_id ); | |
} | |
} | |
} | |
# Check whether it's a legit date | |
sub parseDate { | |
my ($date) = @_; | |
return ( $date =~ tr/-// == 2 ) ? $date : undef; | |
} | |
# Get the anime state | |
sub getState { | |
my ( $start_date, $end_date ) = @_; | |
my $now = DateTime->now->ymd; | |
if ( defined $start_date && $start_date lt $now ) { | |
return ( $end_date && $end_date lt $now ) ? 0 : 1; | |
} | |
# some weird cases .. | |
# when $start_date is null | |
if ( $end_date lt $now ) { | |
return 0; | |
} | |
return 2; | |
} | |
# Map anime_id <=> anidb_id | |
sub mapAnime { | |
my ( $anime_id, $anidb_id ) = @_; | |
my $sql = 'INSERT INTO anime_map (anime_id, anidb_id) VALUES(?, ?)'; | |
my $sth = $dbh->prepare($sql); | |
$sth->execute( $anime_id, $anidb_id ) or die $DBI::errstr; | |
} | |
# Insert the anime | |
sub insertAnime { | |
my ($anime) = @_; | |
my $anidb_id = $anime->{id}; | |
say("AniDB ID: $anidb_id"); | |
# skip a test anime | |
if ( $anidb_id == 357 ) { | |
say("skipping test anime"); | |
next; | |
} | |
# Insert the anime details here.. | |
$synopsis = $anime->{description}; | |
# convert their BBCode links to valid ones | |
# cancer but it works? | |
my @bad_words; | |
while ( $synopsis =~ /\[(.*?)\]/g ) { | |
my $match = $1; | |
unless ( $synopsis =~ /\Q\/$match/ || $match =~ '/' ) { | |
push( @bad_words, $match ); | |
} | |
} | |
$synopsis =~ s{$RE{URI}{-keep}}{[url=$1]}g; | |
foreach my $badword (@bad_words) { | |
my $newString = $badword . '[/url]'; | |
$synopsis =~ s{\s+\[$badword\]}{$newString}g; | |
} | |
$start_date = parseDate( $anime->{startdate} ); | |
$end_date = parseDate( $anime->{enddate} ); | |
$status = getState( $start_date, $end_date ); | |
my $timestamp = Time::Piece->new->strftime('%Y-%m-%d %H:%M:%S'); | |
my $sql = | |
'INSERT INTO anime (synopsis, start_date, end_date, state_id, type_id, episode_count, created_at) | |
VALUES(?, ?, ?, ?, ?, ?, ?)'; | |
my $sth = $dbh->prepare($sql); | |
$sth->execute( | |
$synopsis, $start_date, $end_date, $status, | |
$types{ $anime->{type} }, | |
$anime->{episodecount}, $timestamp | |
) or die $DBI::errstr; | |
my $anime_id = $dbh->last_insert_id( undef, undef, undef, undef, | |
{ sequence => 'anime_id_seq' } ); | |
insertTitles( $anime->{titles}, $anime_id ); | |
insertEpisodes( $anime->{episodes}, $anime_id ); | |
mapAnime( $anime_id, $anidb_id ); | |
} | |
# Simple XML instance | |
my $simple = XML::Simple->new(); | |
# Loop over all of the XML files | |
@files = </home/miraris/Documents/anidb-dump/data/*.xml>; | |
# Schwartzian transform (https://en.wikipedia.org/wiki/Schwartzian_transform) | |
# Sorts the file names list numerically | |
my @sorted = map { $_->[0] } | |
sort { $a->[1] <=> $b->[1] } | |
map { [ $_, $_ =~ /(\d+)/ ] } @files; | |
foreach $file (@sorted) { | |
my $tree = $simple->XMLin($file); | |
insertAnime($tree); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment