dchrostowski/Morningstar.pm

## Morningstar.pm
#! /usr/bin/env perl
package Quan::Worker::ConferenceCallTranscripts::Morningstar;
use Moose;
extends 'Quan::Worker::ConferenceCallTranscripts';
use Method::Signatures;
use URI;
use Crawler;
use JSON qw(encode_json decode_json);
use Data::Dumper;
use FindBin qw($Bin);
use HTML::TreeBuilder::XPath;

has worker_name => (is => 'rw', isa => 'Str', default => 'morningstar.com_transcripts');
has job_count => (is => 'ro', isa => 'Int', default => sub { return int(rand(9) + 3); } );

# Rewrite a morningstar.com earnings link into the Transcript.aspx form.
#
# $url - link string expected to contain "morningstar.com/earnings/<digits>-".
#
# Returns a URI object for Transcript.aspx?id=<digits>.  When no numeric id can
# be pulled out of the link, the link itself is returned as a URI object
# instead of a Transcript.aspx URL carrying an empty id.
method fix_url($url) {
    my ($transcript_id) = $url =~ /morningstar\.com\/earnings\/(\d+)-/;

    # Nothing to rewrite without an id; hand the caller something fetchable.
    return URI->new($url) unless defined $transcript_id;

    my $new_url = URI->new("http://www.morningstar.com/earnings/Transcript.aspx");
    $new_url->query_form(id => $transcript_id);
    return $new_url;
}

# Queue loading is handled by the parent class.  This stub only exists so the
# grandparent class does not complain about an unimplemented method; it always
# reports 0.
method load_queue {
    my $nothing_loaded = 0;
    return $nothing_loaded;
}

# Override.
# Run the supplied SELECT and turn each row into a job hashref.
#
# $sel - SQL text with two placeholders; it is executed with worker_id and
#        job_count bound, in that order.  Each row must yield (id, json_text).
#
# Returns an arrayref of decoded job hashes, each with the row id stored under
# crawl_queue_id.  Dies when prepare or execute reports failure; decode_json
# dies on malformed JSON.
method fetch_crawl_jobs($sel) {
    my $dbh = $self->dbh;

    # Report errors through the handles' own errstr rather than a bareword.
    my $sth = $dbh->prepare($sel)
        or die "prepare failed: " . $dbh->errstr;
    $sth->execute($self->worker_id, $self->job_count)
        or die "execute failed: " . $sth->errstr;

    my @jobs;
    while (my ($id, $data) = $sth->fetchrow_array) {
        my $job = decode_json($data);
        $job->{crawl_queue_id} = $id;
        push @jobs, $job;
    }

    return \@jobs;
}

# Collect every page of a transcript and write it to disk.
#
# $tree     - parsed tree of the first transcript page (queried with findvalues)
# $resp     - raw HTML of the first page; scanned for the ClientPager call that
#             carries the page count
# $url      - URI object of the first page; later pages are the same URL with
#             pindex=<n> added.  The object is cloned, never modified.
# $headline - <title> text of the first page; first line of the output and the
#             input to determine_quarter
# $job      - job hashref; {link} and {security_id} are read
#
# Returns early, writing nothing, when a page request still fails after the
# retry budget is spent.  Otherwise records the link in the link history and
# hands the collected lines to write_transcript.
method extract_transcript($tree, $resp, $url, $headline, $job) {
    my $text_xpath  = './/div[@id="pres"]//p/descendant-or-self::*/text()';
    my $max_retries = 6;

    my ($call_date) = $tree->findvalues('.//div[@id="pres"]/div[1]/text()');
    # The date node can be missing; do not run the substitution on undef.
    $call_date = '' unless defined $call_date;
    $call_date =~ s/[^\d\/]//g;
    my @pre_content = ( $headline, "Transcript call date: $call_date" );

    # From ClientPager.js - dynamically sets up pagination at bottom.  Function is called from DOM.
    # function ClientPager(pageSize, totalCount, varName, pageIndex, queryStringName, archivePage)
    my ($last_page) = $resp =~ /var\s+pdownpager\s+=\s+new ClientPager\(1,\s*(\d+),\s*'pdownpager',\s*null/i;
    # No pager call in the source: treat the transcript as a single page.
    $last_page = 1 unless defined $last_page;

    my @content = $tree->findvalues($text_xpath);

    # Walk pages 2..$last_page.  A failed or empty page is retried with a fresh
    # proxy.  The retry budget ($max_retries) is shared by all pages of this
    # transcript; it is not reset per page.
    my $retry_count = 0;
    my $page = 2;
    PAGE: while ($page <= $last_page) {
        # Work on a copy so the caller's URI object keeps its original query.
        my $page_url = $url->clone;
        my %params = ($page_url->query_form, pindex => $page);
        $page_url->query_form(%params);

        my $page_tree = $self->crawler->get($page_url);
        if (!$self->success) {
            return if $retry_count >= $max_retries;
            $self->crawler->proxy($self->get_proxy);
            $retry_count++;
            next PAGE;    # same page again, $page is not advanced
        }

        my @page_content;
        eval { @page_content = $page_tree->findvalues($text_xpath); };
        if (!@page_content) {
            # Look at the title of the page just fetched (not the first page)
            # to see whether the site served its "No Transcript Data" page.
            my $page_title;
            eval { ($page_title) = $page_tree->findvalues('.//head/title/text()'); };
            if (defined $page_title && $page_title =~ /no transcript data/i) {
                warn "headline should not be No Transcript Data:\n$page_title\n";
            }

            if ($retry_count < $max_retries) {
                $self->crawler->proxy($self->get_proxy);
                $retry_count++;
                next PAGE;
            }

            # Budget spent.  Unless the first page itself was the "no
            # transcript data" page, carry on without this page's text.
            warn "Could not get content from page $page  returning.\n";
            return if $headline =~ /no transcript data/i;
        }
        push @content, @page_content;

        # Sleep some random amount of time before requesting the next page.
        sleep(int(rand(7)+5));
        $page++;
    }

    my @transcript = (@pre_content, @content);
    $self->update_link_history($job->{link});

    my $dir = $self->base_transcript_dir . '/' . $job->{security_id} . '/';
    my $quarter = $self->determine_quarter($headline);
    my $filename;

    if (defined $quarter && defined $quarter->{quarter} && defined $quarter->{year}) {
        $dir .= $quarter->{year} . '/' . $quarter->{quarter};
        $filename = 'morningstar.txt';
    }
    else {
        # Flatten the link into a file name using a lexical copy, so the
        # global $_ (possibly aliased by a caller's loop) is left alone.
        (my $flat_link = $job->{link}) =~ s/http\:\/\///;
        $flat_link =~ s/\//\-\-/g;
        $filename = "$flat_link.txt";
    }

    # Create a new directory structure if one doesn't yet exist.  List-form
    # system() bypasses the shell, so characters in $dir are never interpreted.
    if (! -e $dir) {
        system('mkdir', '-p', $dir) == 0
            or warn "mkdir -p $dir failed (exit status $?)\n";
    }

    my $full_file_path = "$dir/$filename";
    print "\n\n\nWRITING TO $full_file_path\n\n\n";
    $self->write_transcript($full_file_path, \@transcript);
}


# Process every job currently held in crawl_jobs.
#
# For each job: skip and dequeue links already present in the link history;
# otherwise fetch the rewritten transcript URL through a proxy, fall back to
# the original link when the "No Transcript Data" page comes back, and pass a
# usable page to extract_transcript.  Proxies are marked good or bad according
# to the outcome.  The link history is written out once all jobs are done.
#
# Sleeps five minutes and returns when there are no jobs.
method crawl {
    my @jobs = @{$self->crawl_jobs};
    if (!@jobs) {
        print "No jobs in the queue, sleep for 5 minutes.\n";
        sleep(5*60);
        return;
    }

    my $link_history = $self->link_history;
    my $title_xpath  = './/head/title/text()';

    # Same headers for every request, so build them once.
    my %custom_headers = (
        'Host' => 'www.morningstar.com',
        'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        # Language ranges are comma-separated; ';' only introduces the q-value.
        'Accept-Language' => 'en-US,en;q=0.5',
        'Accept-Encoding' => 'gzip,deflate',
        'DNT' => '1',
        'Connection' => 'keep-alive',
        'Upgrade-Insecure-Requests' => 1,
    );

    foreach my $job (@jobs) {
        # Check the history before pulling a proxy that would go unused.
        if ($link_history->{$job->{link}}) {
            warn "Already visited link, skipping and dequeueing job\n";
            $self->dequeue_job($job->{crawl_queue_id});
            next;
        }

        $self->crawler->proxy($self->get_proxy);
        $self->crawler->new_user_agent;
        my $mech = $self->crawler->mech;

        # For some reason HTML::TreeBuilder::XPath strips out <script> tag content which we need
        # So we must manually override Crawler.pm's behavior
        $mech->proxy(['http','https'] => $self->crawler->proxy->to_string);
        my $url = $self->fix_url($job->{link});

        print "getting $url with proxy " . $self->crawler->proxy->to_string . "\n";
        $mech->get($url, %custom_headers);

        if (!$mech->success) {
            $self->crawler->proxy->bad;
            $self->crawler->proxy->update(cache => 1);
            next;
        }

        my $resp = $mech->content();
        my $tree = HTML::TreeBuilder::XPath->new_from_content($resp);
        my ($headline) = $tree->findvalues($title_xpath);
        # A page without a <title> yields no value; match against '' instead.
        $headline = '' unless defined $headline;

        # Try reverting to original link in case it randomly redirects to the "no transcript data" page.
        # Again, major pain to do all this handling manually, but it won't work in Crawler.pm
        # We need the content in <script>here</script> to determine pagination length
        if ($headline =~ /no transcript data/i) {
            my $new_proxy = $self->get_proxy;
            # Register the replacement proxy on the crawler as well, so the
            # good/bad bookkeeping below applies to the proxy actually used.
            $self->crawler->proxy($new_proxy);
            $mech->proxy(['http','https'] => $new_proxy->to_string);
            $url = URI->new($job->{link});
            $mech->get($url, %custom_headers);
            if ($mech->success) {
                $resp = $mech->content();
                $tree->delete;    # release the superseded tree
                $tree = HTML::TreeBuilder::XPath->new_from_content($resp);
                ($headline) = $tree->findvalues($title_xpath);
                $headline = '' unless defined $headline;
            }
        }

        # NOTE(review): the requests above went through $mech directly; confirm
        # that $self->success reflects them and not an earlier crawler->get.
        if ($self->success && $headline !~ /no transcript data/i) {
            $self->proxy_queue_response_good;
            $self->extract_transcript($tree, $resp, $url, $headline, $job);
        }
        else {
            # Crawler.pm usually takes care of this.
            $self->crawler->proxy->bad;
            $self->crawler->proxy->update(cache => 1);
        }

        # Free the parsed tree explicitly once the job is finished.
        $tree->delete;
    }

    $self->write_link_history;
}

1;
	#! /usr/bin/env perl
	package Quan::Worker::ConferenceCallTranscripts::Morningstar;
	use Moose;
	extends 'Quan::Worker::ConferenceCallTranscripts';
	use Method::Signatures;
	use URI;
	use Crawler;
	use JSON qw(encode_json decode_json);
	use Data::Dumper;
	use FindBin qw($Bin);
	use HTML::TreeBuilder::XPath;

	has worker_name => (is => 'rw', isa => 'Str', default => 'morningstar.com_transcripts');
	has job_count => (is => 'ro', isa => 'Int', default => sub { return int(rand(9) + 3); } );

	# Rewrite a morningstar.com earnings link into the Transcript.aspx form.
	#
	# $url - link string expected to contain "morningstar.com/earnings/<digits>-".
	#
	# Returns a URI object for Transcript.aspx?id=<digits>.  When no numeric id can
	# be pulled out of the link, the link itself is returned as a URI object
	# instead of a Transcript.aspx URL carrying an empty id.
	method fix_url($url) {
	    my ($transcript_id) = $url =~ /morningstar\.com\/earnings\/(\d+)-/;

	    # Nothing to rewrite without an id; hand the caller something fetchable.
	    return URI->new($url) unless defined $transcript_id;

	    my $new_url = URI->new("http://www.morningstar.com/earnings/Transcript.aspx");
	    $new_url->query_form(id => $transcript_id);
	    return $new_url;
	}

	# Queue loading is handled by the parent class.  This stub only exists so the
	# grandparent class does not complain about an unimplemented method; it always
	# reports 0.
	method load_queue {
	    my $nothing_loaded = 0;
	    return $nothing_loaded;
	}

	# Override.
	# Run the supplied SELECT and turn each row into a job hashref.
	#
	# $sel - SQL text with two placeholders; it is executed with worker_id and
	#        job_count bound, in that order.  Each row must yield (id, json_text).
	#
	# Returns an arrayref of decoded job hashes, each with the row id stored under
	# crawl_queue_id.  Dies when prepare or execute reports failure; decode_json
	# dies on malformed JSON.
	method fetch_crawl_jobs($sel) {
	    my $dbh = $self->dbh;

	    # Report errors through the handles' own errstr rather than a bareword.
	    my $sth = $dbh->prepare($sel)
	        or die "prepare failed: " . $dbh->errstr;
	    $sth->execute($self->worker_id, $self->job_count)
	        or die "execute failed: " . $sth->errstr;

	    my @jobs;
	    while (my ($id, $data) = $sth->fetchrow_array) {
	        my $job = decode_json($data);
	        $job->{crawl_queue_id} = $id;
	        push @jobs, $job;
	    }

	    return \@jobs;
	}

	# Collect every page of a transcript and write it to disk.
	#
	# $tree     - parsed tree of the first transcript page (queried with findvalues)
	# $resp     - raw HTML of the first page; scanned for the ClientPager call that
	#             carries the page count
	# $url      - URI object of the first page; later pages are the same URL with
	#             pindex=<n> added.  The object is cloned, never modified.
	# $headline - <title> text of the first page; first line of the output and the
	#             input to determine_quarter
	# $job      - job hashref; {link} and {security_id} are read
	#
	# Returns early, writing nothing, when a page request still fails after the
	# retry budget is spent.  Otherwise records the link in the link history and
	# hands the collected lines to write_transcript.
	method extract_transcript($tree, $resp, $url, $headline, $job) {
	    my $text_xpath  = './/div[@id="pres"]//p/descendant-or-self::*/text()';
	    my $max_retries = 6;

	    my ($call_date) = $tree->findvalues('.//div[@id="pres"]/div[1]/text()');
	    # The date node can be missing; do not run the substitution on undef.
	    $call_date = '' unless defined $call_date;
	    $call_date =~ s/[^\d\/]//g;
	    my @pre_content = ( $headline, "Transcript call date: $call_date" );

	    # From ClientPager.js - dynamically sets up pagination at bottom. Function is called from DOM.
	    # function ClientPager(pageSize, totalCount, varName, pageIndex, queryStringName, archivePage)
	    # Whitespace after each comma is optional, hence \s* rather than \s.
	    my ($last_page) = $resp =~ /var\s+pdownpager\s+=\s+new ClientPager\(1,\s*(\d+),\s*'pdownpager',\s*null/i;
	    # No pager call in the source: treat the transcript as a single page.
	    $last_page = 1 unless defined $last_page;

	    my @content = $tree->findvalues($text_xpath);

	    # Walk pages 2..$last_page.  A failed or empty page is retried with a fresh
	    # proxy.  The retry budget ($max_retries) is shared by all pages of this
	    # transcript; it is not reset per page.
	    my $retry_count = 0;
	    my $page = 2;
	    PAGE: while ($page <= $last_page) {
	        # Work on a copy so the caller's URI object keeps its original query.
	        my $page_url = $url->clone;
	        my %params = ($page_url->query_form, pindex => $page);
	        $page_url->query_form(%params);

	        my $page_tree = $self->crawler->get($page_url);
	        if (!$self->success) {
	            return if $retry_count >= $max_retries;
	            $self->crawler->proxy($self->get_proxy);
	            $retry_count++;
	            next PAGE;    # same page again, $page is not advanced
	        }

	        my @page_content;
	        eval { @page_content = $page_tree->findvalues($text_xpath); };
	        if (!@page_content) {
	            # Look at the title of the page just fetched (not the first page)
	            # to see whether the site served its "No Transcript Data" page.
	            my $page_title;
	            eval { ($page_title) = $page_tree->findvalues('.//head/title/text()'); };
	            if (defined $page_title && $page_title =~ /no transcript data/i) {
	                warn "headline should not be No Transcript Data:\n$page_title\n";
	            }

	            if ($retry_count < $max_retries) {
	                $self->crawler->proxy($self->get_proxy);
	                $retry_count++;
	                next PAGE;
	            }

	            # Budget spent.  Unless the first page itself was the "no
	            # transcript data" page, carry on without this page's text.
	            warn "Could not get content from page $page returning.\n";
	            return if $headline =~ /no transcript data/i;
	        }
	        push @content, @page_content;

	        # Sleep some random amount of time before requesting the next page.
	        sleep(int(rand(7)+5));
	        $page++;
	    }

	    my @transcript = (@pre_content, @content);
	    $self->update_link_history($job->{link});

	    my $dir = $self->base_transcript_dir . '/' . $job->{security_id} . '/';
	    my $quarter = $self->determine_quarter($headline);
	    my $filename;

	    if (defined $quarter && defined $quarter->{quarter} && defined $quarter->{year}) {
	        $dir .= $quarter->{year} . '/' . $quarter->{quarter};
	        $filename = 'morningstar.txt';
	    }
	    else {
	        # Flatten the link into a file name using a lexical copy, so the
	        # global $_ (possibly aliased by a caller's loop) is left alone.
	        (my $flat_link = $job->{link}) =~ s/http\:\/\///;
	        $flat_link =~ s/\//\-\-/g;
	        $filename = "$flat_link.txt";
	    }

	    # Create a new directory structure if one doesn't yet exist.  List-form
	    # system() bypasses the shell, so characters in $dir are never interpreted.
	    if (! -e $dir) {
	        system('mkdir', '-p', $dir) == 0
	            or warn "mkdir -p $dir failed (exit status $?)\n";
	    }

	    my $full_file_path = "$dir/$filename";
	    print "\n\n\nWRITING TO $full_file_path\n\n\n";
	    $self->write_transcript($full_file_path, \@transcript);
	}


	# Process every job currently held in crawl_jobs.
	#
	# For each job: skip and dequeue links already present in the link history;
	# otherwise fetch the rewritten transcript URL through a proxy, fall back to
	# the original link when the "No Transcript Data" page comes back, and pass a
	# usable page to extract_transcript.  Proxies are marked good or bad according
	# to the outcome.  The link history is written out once all jobs are done.
	#
	# Sleeps five minutes and returns when there are no jobs.
	method crawl {
	    my @jobs = @{$self->crawl_jobs};
	    if (!@jobs) {
	        print "No jobs in the queue, sleep for 5 minutes.\n";
	        sleep(5*60);
	        return;
	    }

	    my $link_history = $self->link_history;
	    my $title_xpath  = './/head/title/text()';

	    # Same headers for every request, so build them once.
	    my %custom_headers = (
	        'Host' => 'www.morningstar.com',
	        # The wildcard media range is "*/*"; a bare "/" is not a valid range.
	        'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	        # Language ranges are comma-separated; ';' only introduces the q-value.
	        'Accept-Language' => 'en-US,en;q=0.5',
	        'Accept-Encoding' => 'gzip,deflate',
	        'DNT' => '1',
	        'Connection' => 'keep-alive',
	        'Upgrade-Insecure-Requests' => 1,
	    );

	    foreach my $job (@jobs) {
	        # Check the history before pulling a proxy that would go unused.
	        if ($link_history->{$job->{link}}) {
	            warn "Already visited link, skipping and dequeueing job\n";
	            $self->dequeue_job($job->{crawl_queue_id});
	            next;
	        }

	        $self->crawler->proxy($self->get_proxy);
	        $self->crawler->new_user_agent;
	        my $mech = $self->crawler->mech;

	        # For some reason HTML::TreeBuilder::XPath strips out <script> tag content which we need
	        # So we must manually override Crawler.pm's behavior
	        $mech->proxy(['http','https'] => $self->crawler->proxy->to_string);
	        my $url = $self->fix_url($job->{link});

	        print "getting $url with proxy " . $self->crawler->proxy->to_string . "\n";
	        $mech->get($url, %custom_headers);

	        if (!$mech->success) {
	            $self->crawler->proxy->bad;
	            $self->crawler->proxy->update(cache => 1);
	            next;
	        }

	        my $resp = $mech->content();
	        my $tree = HTML::TreeBuilder::XPath->new_from_content($resp);
	        my ($headline) = $tree->findvalues($title_xpath);
	        # A page without a <title> yields no value; match against '' instead.
	        $headline = '' unless defined $headline;

	        # Try reverting to original link in case it randomly redirects to the "no transcript data" page.
	        # Again, major pain to do all this handling manually, but it won't work in Crawler.pm
	        # We need the content in <script>here</script> to determine pagination length
	        if ($headline =~ /no transcript data/i) {
	            my $new_proxy = $self->get_proxy;
	            # Register the replacement proxy on the crawler as well, so the
	            # good/bad bookkeeping below applies to the proxy actually used.
	            $self->crawler->proxy($new_proxy);
	            $mech->proxy(['http','https'] => $new_proxy->to_string);
	            $url = URI->new($job->{link});
	            $mech->get($url, %custom_headers);
	            if ($mech->success) {
	                $resp = $mech->content();
	                $tree->delete;    # release the superseded tree
	                $tree = HTML::TreeBuilder::XPath->new_from_content($resp);
	                ($headline) = $tree->findvalues($title_xpath);
	                $headline = '' unless defined $headline;
	            }
	        }

	        # NOTE(review): the requests above went through $mech directly; confirm
	        # that $self->success reflects them and not an earlier crawler->get.
	        if ($self->success && $headline !~ /no transcript data/i) {
	            $self->proxy_queue_response_good;
	            $self->extract_transcript($tree, $resp, $url, $headline, $job);
	        }
	        else {
	            # Crawler.pm usually takes care of this.
	            $self->crawler->proxy->bad;
	            $self->crawler->proxy->update(cache => 1);
	        }

	        # Free the parsed tree explicitly once the job is finished.
	        $tree->delete;
	    }

	    $self->write_link_history;
	}

	1;