Created
March 1, 2017 17:13
-
-
Save dchrostowski/27c45a6fb649dcbe43110c57ddd6af85 to your computer and use it in GitHub Desktop.
Morningstar transcript crawler component
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env perl | |
package Quan::Worker::ConferenceCallTranscripts::Morningstar; | |
use Moose; | |
extends 'Quan::Worker::ConferenceCallTranscripts'; | |
use Method::Signatures; | |
use URI; | |
use Crawler; | |
use JSON qw(encode_json decode_json); | |
use Data::Dumper; | |
use FindBin qw($Bin); | |
use HTML::TreeBuilder::XPath; | |
# Identifier string for this worker; read-write, defaults to the
# Morningstar transcript worker name.
has worker_name => (
    is      => 'rw',
    isa     => 'Str',
    default => 'morningstar.com_transcripts',
);

# Read-only integer picked at random when the object is built: rand(9) + 3
# truncated to an integer, i.e. a value from 3 to 11. fetch_crawl_jobs binds
# it as the second placeholder of its query.
has job_count => (
    is      => 'ro',
    isa     => 'Int',
    default => sub {
        my $picked = int( rand(9) + 3 );
        return $picked;
    },
);
# Rewrite a Morningstar earnings link into the Transcript.aspx form.
#
# $url - link expected to contain "morningstar.com/earnings/<digits>-".
#
# Returns a URI object. When the numeric id can be extracted the result is
# http://www.morningstar.com/earnings/Transcript.aspx?id=<digits>. When the
# link does not match, the original link is returned (wrapped in a URI
# object) instead of a Transcript.aspx URL with an empty id.
method fix_url($url) {
    my ($transcript_id) = $url =~ m{morningstar\.com/earnings/(\d+)-};

    # No id means the rewritten URL would be "Transcript.aspx?id=", which
    # cannot identify any transcript; fall back to the link as given.
    return URI->new($url) unless defined $transcript_id;

    my $new_url = URI->new("http://www.morningstar.com/earnings/Transcript.aspx");
    $new_url->query_form(id => $transcript_id);
    return $new_url;
}
# Queue loading is handled entirely by the parent class. This stub exists
# only so the grandparent class does not complain about an unimplemented
# method. Always returns 0.
method load_queue {
    my $nothing_loaded = 0;
    return $nothing_loaded;
}
# Override
# Run the given query and turn every returned row into a crawl job.
#
# $sel - SQL text with two placeholders; they are bound to worker_id and
#        job_count, in that order.
#
# Each row is read as (id, JSON text). The JSON is decoded and the row id is
# stored in the decoded structure under the key crawl_queue_id.
#
# Returns an array reference of decoded jobs (empty when there are no rows).
# Dies with the handle's error string when prepare or execute fails;
# decode_json will also die on malformed JSON.
method fetch_crawl_jobs($sel) {
    # NOTE(review): dbh is presumably a DBI database handle (prepare/execute/
    # fetchrow_array are used on it) - errstr is taken from the handles
    # themselves rather than from the DBI package.
    my $dbh = $self->dbh;

    my $sth = $dbh->prepare($sel)
        or die $dbh->errstr;
    $sth->execute($self->worker_id, $self->job_count)
        or die $sth->errstr;

    my @jobs = ();
    while (my ($id, $data) = $sth->fetchrow_array) {
        $data = decode_json($data);
        $data->{crawl_queue_id} = $id;
        push @jobs, $data;
    }

    return \@jobs;
}
# Collect the text of a (possibly multi-page) transcript and write it to disk.
#
# $tree     - XPath-capable tree of the first transcript page.
# $resp     - raw HTML of the first page; scanned for the ClientPager call
#             that carries the total page count.
# $url      - URI object of the first page; further pages are requested by
#             adding a "pindex" query parameter to a copy of it.
# $headline - page title; stored as the first line of the transcript and
#             passed to determine_quarter.
# $job      - job hashref; its "link" and "security_id" entries are read.
#
# Returns nothing (bare return) when a page request keeps failing after the
# retry budget is spent; otherwise returns whatever write_transcript returns.
method extract_transcript($tree, $resp, $url, $headline, $job) {
    my $content_xpath = './/div[@id="pres"]//p/descendant-or-self::*/text()';

    # Retry budget. It is shared by ALL pages of the transcript and is never
    # reset between pages.
    my $max_retries = 6;

    my @pre_content = ( $headline );

    my ($call_date) = $tree->findvalues('.//div[@id="pres"]/div[1]/text()');
    # A missing date node yields undef; treat it as empty so the substitution
    # and interpolation below do not emit warnings (the output is unchanged).
    $call_date = '' unless defined $call_date;
    $call_date =~ s/[^\d\/]//g;
    $call_date = "Transcript call date: $call_date";
    push @pre_content, $call_date;

    # From ClientPager.js - dynamically sets up pagination at bottom. Function is called from DOM.
    # function ClientPager(pageSize, totalCount, varName, pageIndex, queryStringName, archivePage)
    my ($last_page) = $resp =~ /var\s+pdownpager\s+=\s+new ClientPager\(1,\s*(\d+),\s*'pdownpager',\s*null/i;
    # No pager in the markup: there is only the page we already have.
    $last_page = 1 unless defined $last_page;

    my @content = $tree->findvalues($content_xpath);

    # Iterate through the remaining pages, switching proxy and re-requesting
    # the same page on failure until the shared retry budget runs out.
    # The proxy queues haven't learned enough yet, so there will be fewer retries
    # once the crawler fully cycles through all available proxies.
    my $retry_count = 0;
    for (my $i = 2; $i <= $last_page; $i++) {
        my $page = $i;

        # Work on a copy so the caller's URI object is not modified.
        my $clone_url = $url->clone;
        my %params    = $clone_url->query_form;
        $params{pindex} = $page;
        $clone_url->query_form(%params);

        my $page_tree = $self->crawler->get($clone_url);
        if (!$self->success) {
            return if $retry_count >= $max_retries;

            # Decrementing $i cancels the loop increment, so "next" repeats
            # the same page with a fresh proxy.
            $self->crawler->proxy($self->get_proxy);
            $i--;
            $retry_count++;
            next;
        }

        # eval guards against $page_tree not being a usable tree object.
        my @page_content;
        eval { @page_content = $page_tree->findvalues($content_xpath); };

        if (@page_content < 1) {
            # NOTE(review): this reads the title from the FIRST page's tree
            # ($tree), not from $page_tree - confirm whether that is intended.
            my ($headline_loc) = $tree->findvalues('.//head/title/text()');
            if (defined $headline_loc && $headline_loc eq $headline && $headline_loc !~ /no transcript data/i) {
                warn "headline should not be No Transcript Data:\n$headline_loc\n";
            }

            if ($retry_count < $max_retries) {
                $self->crawler->proxy($self->get_proxy);
                $i--;
                $retry_count++;
                next;
            }
            else {
                warn "Could not get content from page $page returning.\n";
                # NOTE(review): when the headline is a normal title this does
                # not return, so the transcript is written without this page.
                return if $headline =~ /no transcript data/i;
            }
        }

        push @content, @page_content;

        # Sleep some random amount of time before requesting the next page.
        sleep(int(rand(7)+5));
    }

    unshift @content, @pre_content;
    $self->update_link_history($job->{link});

    my $dir      = $self->base_transcript_dir . '/' . $job->{security_id} . '/';
    my $quarter  = $self->determine_quarter($headline);
    my $filename;
    if (defined $quarter && defined $quarter->{quarter} && defined $quarter->{year}) {
        $dir .= $quarter->{year} . '/' . $quarter->{quarter};
        $filename = 'morningstar.txt';
    }
    else {
        # No quarter could be determined: derive the file name from the link
        # by dropping "http://" and turning every "/" into "--". A lexical
        # copy is used so the global $_ is left alone.
        my $flat_link = $job->{link};
        $flat_link =~ s{http://}{};
        $flat_link =~ s{/}{--}g;
        $filename = "$flat_link.txt";
    }

    # Create a new directory structure if one doesn't yet exist. make_path is
    # used instead of shelling out so the path is never interpreted by a shell.
    if (! -e $dir) {
        require File::Path;
        File::Path::make_path($dir, { error => \my $mkdir_errors });
        for my $diag (@{ $mkdir_errors || [] }) {
            my ($failed_path, $message) = %$diag;
            warn "Could not create directory '$failed_path': $message\n";
        }
    }

    my $full_file_path = "$dir/$filename";
    print "\n\n\nWRITING TO $full_file_path\n\n\n";
    return $self->write_transcript($full_file_path, \@content);
}
# Process the current batch of crawl jobs.
#
# For each job: pick a proxy and user agent, skip (and dequeue) links already
# present in the link history, fetch the rewritten transcript URL directly
# through the mechanize object, fall back once to the original link when the
# page title says "no transcript data", and hand usable pages to
# extract_transcript. Proxies are marked bad when a fetch fails or no usable
# page is obtained.
#
# Takes no arguments. When the queue is empty it sleeps for five minutes and
# returns nothing; otherwise it returns whatever write_link_history returns.
method crawl {
    my @jobs = @{$self->crawl_jobs};
    if (@jobs < 1) {
        print "No jobs in the queue, sleep for 5 minutes.\n";
        sleep(5*60);
        return;
    }

    # Request headers sent with every page fetch; identical for all jobs, so
    # built once. Accept-Language lists its language ranges comma-separated.
    my %custom_headers = (
        'Host'                      => 'www.morningstar.com',
        'Accept'                    => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language'           => 'en-US,en;q=0.5',
        'Accept-Encoding'           => 'gzip,deflate',
        'DNT'                       => '1',
        'Connection'                => 'keep-alive',
        'Upgrade-Insecure-Requests' => 1,
    );

    # True when a page title is present and announces missing transcript data.
    # A page without a <title> yields undef, which counts as "not missing".
    my $is_no_data_title = sub {
        my ($title) = @_;
        return defined $title && $title =~ /no transcript data/i;
    };

    my $link_history = $self->link_history;

    foreach my $job (@jobs) {
        $self->crawler->proxy($self->get_proxy);
        $self->crawler->new_user_agent;
        my $mech = $self->crawler->mech;

        if ($link_history->{$job->{link}}) {
            warn "Already visited link, skipping and dequeueing job\n";
            $self->dequeue_job($job->{crawl_queue_id});
            next;
        }

        # For some reason HTML::TreeBuilder::XPath strips out <script> tag content which we need
        # So we must manually override Crawler.pm's behavior
        $mech->proxy(['http','https'] => $self->crawler->proxy->to_string);
        my $url = $self->fix_url($job->{link});

        print "getting $url with proxy " . $self->crawler->proxy->to_string . "\n";
        $mech->get($url, %custom_headers);
        if (!$mech->success) {
            $self->crawler->proxy->bad;
            $self->crawler->proxy->update(cache => 1);
            next;
        }

        my $resp = $mech->content();
        my $tree = HTML::TreeBuilder::XPath->new_from_content($resp);
        my ($headline) = $tree->findvalues('.//head/title/text()');

        # Try reverting to original link in case it randomly redirects to the "no transcript data" page.
        # Again, major pain to do all this handling manually, but it won't work in Crawler.pm
        # We need the content in <script>here</script> to determine pagination length
        if ($is_no_data_title->($headline)) {
            # NOTE(review): the replacement proxy is installed on $mech only;
            # $self->crawler->proxy still refers to the previous proxy, which
            # is the one credited or blamed below - confirm this is intended.
            my $new_proxy = $self->get_proxy;
            $mech->proxy(['http','https'] => $new_proxy->to_string);
            $url = URI->new($job->{link});
            $mech->get($url, %custom_headers);
            if ($mech->success) {
                $resp = $mech->content();
                $tree = HTML::TreeBuilder::XPath->new_from_content($resp);
                ($headline) = $tree->findvalues('.//head/title/text()');
            }
        }

        # NOTE(review): this consults $self->success although the requests
        # above went through $mech directly - verify it reflects them.
        if ($self->success && !$is_no_data_title->($headline)) {
            $self->proxy_queue_response_good;
            $self->extract_transcript($tree, $resp, $url, $headline, $job);
        }
        else {
            # Crawler.pm usually takes care of this.
            $self->crawler->proxy->bad;
            $self->crawler->proxy->update(cache => 1);
        }
    }

    return $self->write_link_history;
}
1; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment