Skip to content

Instantly share code, notes, and snippets.

@Skarsnik
Created November 20, 2015 17:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Skarsnik/00a0e619db84c4adeb32 to your computer and use it in GitHub Desktop.
Save Skarsnik/00a0e619db84c4adeb32 to your computer and use it in GitHub Desktop.
use v6;
use Gumbo;
use XML;
use HTTP::UserAgent;
# Base URLs of the site. Note that plain http is used here, not https.
my $bbaseurl = "http://www.fimfiction.net/bookshelf/";
my $fimbaseurl = "http://www.fimfiction.net/";

# The bookshelf id is the first command-line argument; without it there is
# no meaningful URL to fetch, so stop right away with a usage message.
my $bookshelfid = @*ARGS[0]
    // die "Usage: {$*PROGRAM-NAME} <bookshelf-id>";
my $url = $bbaseurl ~ $bookshelfid;

my $ua = HTTP::UserAgent.new;
# Fimfiction hides mature content (violent/sex stories) by default;
# this cookie asks the site to include it.
$ua.cookies.set-cookie('Set-Cookie:view_mature=true; ');

# HTTP::UserAgent's get gives us a response object.
my $rep = $ua.get($url);
if ! $rep.is-success {
    # Keep the URL and the status line visually separated in the message.
    die "Can't contact $url: " ~ $rep.status-line;
}
# First we are only interested in the number of pages.
# We could have called parse-html($rep.content) and searched the resulting
# XML tree, but the parse-html provided by Gumbo offers some basic filtering
# that speeds up the parsing:
#   :SINGLE       stops at the first element matching <div class="page_list">
#   :nowhitespace skips the whitespace found outside elements
#                 (such as indentation tabs)
my $xmldoc = parse-html($rep.content, :TAG<div>, :class<page_list>, :SINGLE, :nowhitespace);
# Note: $xmldoc contains the <html> tag as root, not the <div>.
# We don't care about the <ul> or the other content of this div, so just
# collect all the <li> tags.
my @pages_li = $xmldoc.lookfor(:TAG<li>);
my $number_of_page = 1;
# With more than one <li>, the second-to-last one carries the last page number.
if @pages_li.elems > 1 {
    my $last_page_label = @pages_li[*-2][0][0].text;
    # Convert explicitly: the value is used as a numeric range bound later,
    # and a non-numeric label should fail here with a clear message.
    $number_of_page = $last_page_label.trim.Int
        // die "Unexpected page number '$last_page_label' in the pagination of $url";
}
say "Bookshelf n°$bookshelfid has $number_of_page page(s)";
# Fetch every page of the bookshelf and print one summary line per story.
for 1..$number_of_page -> $page {
    my $page_url = "$url?order=date_added&page=$page";
    $rep = $ua.get($page_url);
    if ! $rep.is-success {
        die "Can't get $page_url: " ~ $rep.status-line;
    }
    # No :SINGLE this time, since we want all the story_content_box divs.
    my $stories_div = parse-html($rep.content, :TAG<div>, :class<story_content_box>, :nowhitespace);
    for $stories_div.root.nodes -> $story_div {
        # Start with empty lists so that a story without tags or characters
        # prints as empty instead of "(Any)" with an uninitialized warning.
        my %story = tags => [], character_tags => [];

        # :SINGLE makes lookfor return one XML::Element instead of an Array.
        %story<title> = $story_div.lookfor(:TAG<a>, :class<story_name>, :SINGLE)[0].text;

        # The author name is the text of a link inside the author div.
        %story<author> = $story_div.lookfor(:TAG<div>, :class<author>, :SINGLE).lookfor(:TAG<a>, :SINGLE)[0].text;

        # Tags are links whose class starts with "story_category";
        # a regex on the class attribute selects them.
        my $description_div = $story_div.lookfor(:TAG<div>, :class<description>, :SINGLE);
        my @tags = $description_div.lookfor(:TAG<a>, :class(/^story_category/));
        for @tags -> $atag {
            %story<tags>.push($atag[0].text);
        }

        # Character names are found on their clickable character icons.
        my $extradiv = $story_div.lookfor(:TAG<div>, :class<extra_story_data>, :SINGLE);
        my @charactera = $extradiv.lookfor(:TAG<a>, :class<character_icon>);
        for @charactera -> $aelem {
            # Read the title attribute of the element.
            %story<character_tags>.push($aelem<title>);
        }

        say "%story<title> by %story<author> "~%story<tags>.gist~" involving : "~%story<character_tags>.join(', ');
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment