Morningstar transcript crawler component
#!/usr/bin/env perl
package Quan::Worker::ConferenceCallTranscripts::Morningstar;
use Moose;
extends 'Quan::Worker::ConferenceCallTranscripts';
use Method::Signatures;
use URI;
use Crawler;
use JSON qw(encode_json decode_json);
use Data::Dumper;
use FindBin qw($Bin);
use File::Path qw(make_path);
use HTML::TreeBuilder::XPath;
has worker_name => (is => 'rw', isa => 'Str', default => 'morningstar.com_transcripts');
has job_count => (is => 'ro', isa => 'Int', default => sub { return int(rand(9) + 3); } );
method fix_url($url) {
    my ($transcript_id) = $url =~ /morningstar\.com\/earnings\/(\d+)-/;
    my $new_url = URI->new("http://www.morningstar.com/earnings/Transcript.aspx");
    $new_url->query_form(id => $transcript_id);
    return $new_url;
}
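# For example (illustrative id and slug, not a real transcript):
#   fix_url('http://www.morningstar.com/earnings/12345-acme-corp-q4-2016.aspx')
#   returns http://www.morningstar.com/earnings/Transcript.aspx?id=12345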
# Parent class does all the queue loading; this is just implemented so the grandparent
# class doesn't yell at you for failing to implement the method.
method load_queue {
    return 0;
}
# Override
method fetch_crawl_jobs($sel) {
    my $sth = $self->dbh->prepare($sel);
    $sth->execute($self->worker_id, $self->job_count) or die $sth->errstr;
    my @jobs = ();
    while(my ($id, $data) = $sth->fetchrow_array) {
        $data = decode_json($data);
        $data->{crawl_queue_id} = $id;
        push @jobs, $data;
    }
    return \@jobs;
}
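# The caller supplies $sel; it is expected to bind (worker_id, job_count) and return
# (id, json_data) rows, something along these lines (illustrative schema only):
#   SELECT id, data FROM crawl_queue WHERE worker_id = ? LIMIT ?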
method extract_transcript($tree, $resp, $url, $headline, $job) {
    my @pre_content = ( $headline );
    my ($call_date) = $tree->findvalues('.//div[@id="pres"]/div[1]/text()');
    $call_date =~ s/[^\d\/]//g;
    $call_date = "Transcript call date: $call_date";
    push @pre_content, $call_date;
    # From ClientPager.js - dynamically sets up pagination at bottom. Function is called from DOM.
    # function ClientPager(pageSize, totalCount, varName, pageIndex, queryStringName, archivePage)
    my ($last_page) = $resp =~ /var\s+pdownpager\s+=\s+new ClientPager\(1,\s*(\d+),\s*'pdownpager',\s*null/i;
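    # A transcript with, say, 7 pages embeds something like the following in a
    # <script> block (values illustrative), so $last_page captures the page count:
    #   var pdownpager = new ClientPager(1, 7, 'pdownpager', null, 'pindex', false);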
    my @content = $tree->findvalues('.//div[@id="pres"]//p/descendant-or-self::*/text()');
    # Iterate through the remaining pages, retrying up to 6 times in total if a page fails.
    # The proxy queues haven't learned enough yet, so there will be fewer retries once the
    # crawler fully cycles through all available proxies.
    my $retry_count = 0;
    for(my $i=2; $i <= $last_page; $i++) {
        my $page = $i;
        # Clone the URI so adding pindex doesn't mutate the caller's $url.
        my $clone_url = $url->clone;
        my %params = $clone_url->query_form;
        $params{pindex} = $page;
        $clone_url->query_form(%params);
        my $page_tree = $self->crawler->get($clone_url);
        if(!$self->success && $retry_count < 6) {
            $self->crawler->proxy($self->get_proxy);
            $i--;
            $retry_count++;
            next;
        }
        elsif(!$self->success && $retry_count >= 6) {
            return;
        }
        my @page_content;
        eval { @page_content = $page_tree->findvalues('.//div[@id="pres"]//p/descendant-or-self::*/text()'); };
        if(@page_content < 1) {
            # Check whether this request was silently redirected to the "No Transcript Data" page.
            my ($headline_loc) = eval { $page_tree->findvalues('.//head/title/text()') };
            if(defined $headline_loc && $headline_loc =~ /no transcript data/i) {
                warn "headline should not be No Transcript Data:\n$headline_loc\n";
            }
            if($retry_count < 6) {
                $self->crawler->proxy($self->get_proxy);
                $i--;
                $retry_count++;
                next;
            }
            else {
                warn "Could not get content from page $page, returning.\n";
                return if $headline =~ /no transcript data/i;
            }
        }
        push @content, @page_content;
        # Sleep a random amount of time before requesting the next page.
        sleep(int(rand(7)+5));
    }
    @content = (@pre_content, @content);
    $self->update_link_history($job->{link});
    my $dir = $self->base_transcript_dir . '/' . $job->{security_id} . '/';
    my $quarter = $self->determine_quarter($headline);
    my $filename;
    if(defined $quarter && defined $quarter->{quarter} && defined $quarter->{year}) {
        $dir .= $quarter->{year} . '/' . $quarter->{quarter};
        $filename = 'morningstar.txt';
    }
    else {
        # Fall back to a filename derived from the link when the quarter can't be determined.
        (my $link = $job->{link}) =~ s{^http://}{};
        $link =~ s{/}{--}g;
        $filename = "$link.txt";
    }
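    # e.g. <base_transcript_dir>/1234/2016/Q4/morningstar.txt when the quarter parses,
    # otherwise <base_transcript_dir>/1234/www.morningstar.com--earnings--....txt
    # (security_id, year, and quarter formats illustrative).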
    # Create the directory structure if it doesn't yet exist. make_path avoids
    # shelling out to mkdir, which would choke on paths with shell metacharacters.
    make_path($dir) unless -d $dir;
    my $full_file_path = "$dir/$filename";
    print "\n\n\nWRITING TO $full_file_path\n\n\n";
    $self->write_transcript($full_file_path, \@content);
}
method crawl {
    my @jobs = @{$self->crawl_jobs};
    if(@jobs < 1) {
        print "No jobs in the queue, sleeping for 5 minutes.\n";
        sleep(5*60);
        return;
    }
    my $link_history = $self->link_history;
    foreach my $job (@jobs) {
        $self->crawler->proxy($self->get_proxy);
        $self->crawler->new_user_agent;
        my $mech = $self->crawler->mech;
        if($link_history->{$job->{link}}) {
            warn "Already visited link, skipping and dequeueing job\n";
            $self->dequeue_job($job->{crawl_queue_id});
            next;
        }
        my $custom_headers = {
            'Host' => 'www.morningstar.com',
            'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language' => 'en-US,en;q=0.5',
            'Accept-Encoding' => 'gzip, deflate',
            'DNT' => '1',
            'Connection' => 'keep-alive',
            'Upgrade-Insecure-Requests' => '1',
        };
        # For some reason HTML::TreeBuilder::XPath strips out <script> tag content, which we need,
        # so we must manually override Crawler.pm's behavior and drive the Mech object directly.
        $mech->proxy(['http','https'] => $self->crawler->proxy->to_string);
        my $url = $self->fix_url($job->{link});
        print "getting $url with proxy " . $self->crawler->proxy->to_string . "\n";
        # $req_as_string shows the request headers, for debugging.
        my $req_as_string = $mech->get($url, %$custom_headers)
            ->request->as_string();
        if(!$mech->success) {
            $self->crawler->proxy->bad;
            $self->crawler->proxy->update(cache => 1);
            next;
        }
        my $resp = $mech->content();
        my $tree = HTML::TreeBuilder::XPath->new_from_content($resp);
        my ($headline) = $tree->findvalues('.//head/title/text()');
        # Try reverting to the original link in case it randomly redirects to the "no transcript
        # data" page. Again, major pain to do all this handling manually, but it won't work in
        # Crawler.pm; we need the content in <script>here</script> to determine pagination length.
        if($headline =~ /no transcript data/i) {
            my $new_proxy = $self->get_proxy;
            $mech->proxy(['http','https'] => $new_proxy->to_string);
            $url = URI->new($job->{link});
            $req_as_string = $mech
                ->get($url, %$custom_headers)
                ->request->as_string();
            if($mech->success) {
                $resp = $mech->content();
                $tree = HTML::TreeBuilder::XPath->new_from_content($resp);
                ($headline) = $tree->findvalues('.//head/title/text()');
            }
        }
        # The requests above went through $mech directly, so check $mech->success
        # here rather than relying on Crawler.pm's own status.
        if($mech->success && $headline !~ /no transcript data/i) {
            $self->proxy_queue_response_good;
            $self->extract_transcript($tree, $resp, $url, $headline, $job);
        }
        else {
            # Crawler.pm usually takes care of this.
            $self->crawler->proxy->bad;
            $self->crawler->proxy->update(cache => 1);
        }
    }
    $self->write_link_history;
}
1;
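# Hypothetical driver loop -- worker_id, dbh, and proxy wiring are assumed to come
# from the parent class, and any required constructor args are not shown here:
#   my $worker = Quan::Worker::ConferenceCallTranscripts::Morningstar->new;
#   $worker->crawl while 1;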