Created
November 20, 2015 17:46
-
-
Save Skarsnik/00a0e619db84c4adeb32 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use v6;
use Gumbo;
use XML;
use HTTP::UserAgent;

# Prints one summary line (title, author, tags, characters) for every story
# found in a fimfiction.net bookshelf.
# The bookshelf id is taken from the first command-line argument.

# Base URLs. Note that plain http is used here, not https.
my $bbaseurl = "http://www.fimfiction.net/bookshelf/";
# Not referenced in this part of the script; kept in case other code relies on it.
my $fimbaseurl = "http://www.fimfiction.net/";

# Fail early with a clear message instead of building a URL from an undefined value.
my $bookshelfid = @*ARGS[0] // die "Missing argument: expected a bookshelf id";
my $url = $bbaseurl~$bookshelfid;

my $ua = HTTP::UserAgent.new;
# Fimfiction hides mature content (violent/sexual stories) by default;
# this cookie asks the site to show it.
$ua.cookies.set-cookie('Set-Cookie:view_mature=true; ');

# HTTP::UserAgent.get returns a response object.
my $rep = $ua.get($url);
unless $rep.is-success {
    die "Can't contact $url ({$rep.status-line})";
}

# First we only need the number of pages.
# We could call parse-html($rep.content) and search the whole XML tree, but
# the parse-html provided by Gumbo offers basic filtering that speeds things up:
#   :SINGLE       stops at the first element matching div class="page_list"
#   :nowhitespace skips whitespace-only text outside elements (e.g. indentation)
my $xmldoc = parse-html($rep.content, :TAG<div>, :class<page_list>, :SINGLE, :nowhitespace);

# Note: $xmldoc has the html tag as root, not the <div>.
# The <ul> and other content of the div are irrelevant; just collect the <li> tags.
my @pages_li = $xmldoc.lookfor(:TAG<li>);

my $number_of_page = 1;
# With more than one <li>, the second-to-last one is read as the last page number
# (its first child's first child is expected to be the text node).
if @pages_li.elems > 1 {
    # .Int makes the string-to-number conversion explicit before it is used as a range end.
    $number_of_page = @pages_li[*-2][0][0].text.Int;
}
say "Bookshelf n°$bookshelfid has $number_of_page page(s)";

for 1..$number_of_page -> $page {
    my $page_url = "$url?order=date_added&page=$page";
    $rep = $ua.get($page_url);
    unless $rep.is-success {
        die "Can't get $page_url ({$rep.status-line})";
    }

    # No :SINGLE this time, since we want every story_content_box div.
    my $stories_div = parse-html($rep.content, :TAG<div>, :class<story_content_box>, :nowhitespace);
    for $stories_div.root.nodes -> $story_div {
        # Start with empty lists so a story without tags or characters still
        # prints cleanly instead of exposing undefined values.
        my %story = tags => [], character_tags => [];

        # :SINGLE makes lookfor return one XML::Element instead of an Array of them.
        %story<title> = $story_div.lookfor(:TAG<a>, :class<story_name>, :SINGLE)[0].text;

        # The author name is the text of a link nested inside the "author" div.
        %story<author> = $story_div.lookfor(:TAG<div>, :class<author>, :SINGLE).lookfor(:TAG<a>, :SINGLE)[0].text;

        # Tags are links whose class starts with "story_category"; a regex matches them all.
        my $description_div = $story_div.lookfor(:TAG<div>, :class<description>, :SINGLE);
        my @tags = $description_div.lookfor(:TAG<a>, :class(/^story_category/));
        for @tags -> $atag {
            %story<tags>.push($atag[0].text);
        }

        # Character names are stored in the title attribute of their clickable icons.
        my $extradiv = $story_div.lookfor(:TAG<div>, :class<extra_story_data>, :SINGLE);
        my @charactera = $extradiv.lookfor(:TAG<a>, :class<character_icon>);
        for @charactera -> $aelem {
            # Hash-style subscript on an element reads one attribute.
            %story<character_tags>.push($aelem<title>);
        }

        say "%story<title> by %story<author> "~%story<tags>.gist~" involving : "~%story<character_tags>.join(', ');
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment