-
-
Save hesco/7217d168aa20b8e8fa23 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
chrome's inspect element says the css path is: | |
#mn-member-general > div > div.mn-member-basicinfo > div.mn-member-fax | |
and that the xpath is: | |
//*[@id="mn-member-general"]/div/div[3]/div[5] | |
The element I need to parse looks like this: | |
<div itemprop="faxNumber" class="mn-member-fax">(912) 555-1212 (fax)</div> | |
I'm writing a parser which at the moment needs to extract: | |
(912) 555-1212 (fax) from this sample code. | |
I've sorted out how to make many of these work, | |
but remained confused about how to sort out | |
the layers of array references I must dig through | |
to reach through to find the data I am interested in. | |
Can anyone here please provide some guidance on how to do this in a repeatable way? |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# My code looks like this: | |
package Foo; | |
use Moose; | |
use Mojo::DOM; | |
use File::Slurp; | |
use Data::Dumper; | |
sub parse_entry { | |
my $self = shift; | |
my %result = (); | |
my $html_file = 'samples/SampleListing.html'; | |
my $html = read_file( $html_file ); | |
my $dom = Mojo::DOM->new( $html ); | |
my @listing = $dom->find( '#mn-member-general' ); | |
my $entry = $dom->find( '#mn-member-general'); | |
# <snip> | |
$result{'fax'} = $self->parse_fax( $entry ); | |
return \%result; | |
} | |
sub parse_fax { # <-- WORK_IN_PROGRESS | |
my $self = shift; | |
my $entry = shift; | |
my $fax; | |
my $fax_entries_raw = $entry->find( 'div.mn-member-basicinfo' ) | |
->find( 'div.mn-member-fax' ) | |
; | |
# ->to_array | |
# ->attr( itemprop => 'faxNumber' ) | |
# ->[0]->[0]; | |
print Dumper( $fax_entries_raw ); | |
return $fax_entries_raw; | |
} | |
sub parse_phones { # <-- UNTESTED, so far just notes | |
my $self = shift; | |
my $entry = shift; | |
# BUG: ignores second phone number. | |
my @phones; | |
my $phone_entries_raw = $entry->find( 'div.mn-member-basicinfo' ) | |
->find( 'div[itemprop=telephone]' ) | |
->each | |
->text->[0]->[0]->[0]; | |
print Dumper( $phone_entries_raw ); | |
return \@phones; | |
} | |
sub parse_phone { # <-- WORKS | |
my $self = shift; | |
my $entry = shift; | |
# BUG: ignores second phone number. | |
my $phone; | |
my $phone_entries_raw = $entry->find( 'div.mn-member-basicinfo' ) | |
->find( 'div.mn-member-phone1' ) | |
->attr( itemprop => 'telephone' ) | |
->text->[0]->[0]->[0]; | |
print Dumper( $phone_entries_raw ); | |
return $phone_entries_raw; | |
} | |
1; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$VAR1 = bless( [ | |
bless( [ | |
bless( [], 'Mojo::Collection' ) | |
], 'Mojo::Collection' ) | |
], 'Mojo::Collection' ); | |
$VAR1 = { | |
'state' => BROKEN, | |
'fax' => bless( [ | |
bless( [ | |
bless( [], 'Mojo::Collection' ) | |
], 'Mojo::Collection' ) | |
], 'Mojo::Collection' ), | |
'city' => 'WORKS', | |
'street_address' => 'WORKS', | |
'email' => 'WORKS', | |
'url' => 'WORKS', | |
'name' => 'WORKS', | |
'phone' => 'WORKS' | |
}; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment