Last active
March 28, 2018 02:34
-
-
Save tostka/e08edcf251632b996e6f1d6653ca514e to your computer and use it in GitHub Desktop.
Interactive Imdb title lookup function, uses xml (html) parsing. Returns and lists closest matches in menu, then returns details of selected choice
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<# TRIMMED GIST, SEE FULL Get-IMDBSearch.ps1 SCRIPT FOR DETAILS #>
#*------v Function Get-IMDBSearch v------
Function Get-IMDBSearch {
    <# TRIMMED #>
    # Interactive IMDb title lookup:
    #   1. Runs a Google search restricted to imdb.com/title for $Title.
    #   2. Lists the de-duplicated hits as a numbered console menu and reads the user's choice.
    #   3. Downloads the chosen IMDb title page and scrapes it into an ordered dictionary.
    # Inputs : $Title, $showDebug and $rgxImdbID are used but not defined in this (trimmed) body;
    #          presumably they come from the trimmed param()/setup section -- confirm against the full script.
    # Output : a single ordered dictionary keyed by the names in $hashfields (unavailable values are
    #          mostly "-"). Nothing is emitted when there are no matches, the request fails, or the
    #          user aborts.

    # --- 1. search -------------------------------------------------------------------------------
    $qryUrlRoot = "http://google.com/search?q=" ;
    # Escape the whole query so titles containing '&', '#', '+' etc. cannot break the query string.
    $qryText = "site:imdb.com/title $($Title.trim())" ;
    $url = "$($qryUrlRoot)$([uri]::EscapeDataString($qryText))" ;
    try {
        $webPage = Invoke-WebRequest -Uri $url -ErrorAction Stop ;
    } catch {
        write-warning "Search request failed for $($url): $($_.Exception.Message)" ;
        return ;
    } ;
    $pageElems = $webPage.AllElements ;
    $matchedhits = ($pageElems | ?{$_.class -eq "g"}) | select innertext ;
    if(-not $matchedhits){
        write-host "No matches on qry:$($Title)" ;
        return ;
    } ;
    if($showDebug){write-verbose -verbose:$true "Processing $($matchedhits.count) matches:"} ;

    # --- 2. build the menu entries ---------------------------------------------------------------
    # Hits whose heading names an IMDb sub-page (Trivia, Full Cast & Crew, ...) are not title pages.
    $rgxSubPage = ".*\s-\s(Trivia|Full\sCast\s&\sCrew|Parents\sGuide|Awards|Plot\sSummary|Company\scredits|FAQ|Plot\skeywords|Photo\sGallery|Taglines|Filming\sLocations|Quotes|News|Synopsis|Soundtracks|External\sReviews|Crazy\sCredits|Connections|TV\sschedule|Release\sInfo|Technical\sSpecifications|Video\sGallery)\s-\sIMDb.*" ;
    $mnuItems = @() ;
    foreach ($hit in $matchedhits){
        if(-not $hit.innerText){ continue } ;
        $fields = $hit.innerText.split("`n") ;
        if($fields[0].tostring().trim() -match $rgxSubPage){
            if($showDebug){Write-Host "Skipped: $($fields[0])..." ; } ;
            continue ;
        } ;
        # Reset per hit so an unexpected field count never shows the previous hit's summary.
        $Summary = "(no data)" ;
        if(($fields.Count -eq 4) -or ($fields.Count -eq 5)){
            if($fields[3]){
                $Summary = "$($fields[3].substring(0,[System.Math]::Min(50, $fields[3].Length)))..." ;
            } elseif($fields.Count -eq 4){
                $Summary = "(missing 3rd line)" ;
            } ;
        } ;
        # Only warn about a missing id for hits that are actually going to be listed.
        if($fields[1] -match $rgxImdbID ){
            $imdbID=$matches[0] ;
        } else {
            write-warning "FAILED TO MATCH IMDBid FOR $(($fields | out-string).trim())`nSelection will not be openable without a valid IMDBid!";
            $imdbID="-" ;
        } ;
        $props=[ordered]@{
            'Title'=$($fields[0].tostring().trim()) ;
            'imdbID'=$($imdbID) ;
            'Summary'=$($Summary) ;
        } ;
        # De-duplicate on imdbID: several search hits can point at the same title.
        if(!($conflict = $mnuItems | ?{$_.imdbID -eq $props.imdbid} )){
            $omnuEntry = New-Object PSObject -Property $props ;
            $mnuItems += $omnuEntry
        } else {
            if($showDebug){write-verbose -verbose:$true "$($props.Title) ($($props.imdbID))`n$($props.Summary)`n dupes existing entry $(($conflict|out-string).trim())" ;} ;
        } ;
    } # loop-E;

    if($mnuItems.Count -eq 0){
        # Every hit was a skipped sub-page: there is nothing to choose from.
        write-host "No matches on qry:$($Title)" ;
        return ;
    } ;

    # --- 3. show the menu and read the choice ----------------------------------------------------
    write-host -ForegroundColor Yellow "Query: '$($Title)'" ;
    $mnuItem=0 ;
    foreach ($mnu in $mnuItems){
        $mnuItem++ ;
        write-host "$($mnuItem). $($mnu.Title),$($mnu.Summary),$($Mnu.imdbID)" ;
    } ;
    if($mnuItems.Count -gt 1){
        $exitChoice = $mnuItems.Count + 1 ;
        $mnuExitText= "[Abort & Exit]" ;
        write-host "$($exitChoice). $($mnuExitText)" ;
        $rawChoice = Read-Host 'Enter selection' ;
        $parsedChoice = 0 ;
        $isNumber = [int]::TryParse($rawChoice, [ref]$parsedChoice) ;
        if((-not $isNumber) -or ($parsedChoice -lt 1) -or ($parsedChoice -gt $exitChoice)){
            write-warning "Invalid selection '$($rawChoice)': expected a number from 1 to $($exitChoice)" ;
            return ;
        } ;
        [int]$choice = $parsedChoice ;
        if($choice -eq $exitChoice){
            write-host -ForegroundColor green "Exiting..." ;
            # 'return', not 'exit': 'exit' here would end the whole calling script/session.
            return ;
        } ;
    } else {
        write-host "single-item menu, defaulting"
        [int]$choice = 1 ;
    } ;
    # Resolve by menu position, so two hits that share a Title cannot yield several ids.
    $picked = $mnuItems[$choice - 1] ;
    $selection = $picked.Title ;
    $TImdbID = $picked.imdbID ;
    if($TImdbID -eq "-"){
        write-warning "Selection '$($selection)' has no IMDBid and cannot be opened" ;
        return ;
    } ;

    # --- 4. fetch and scrape the title page ------------------------------------------------------
    # Pre-create every key (value $null) so the output always has the same fields in the same order.
    $moviedata = [ordered]@{} ;
    $hashfields="Type","MpaaRating","Genres","UsrRatingsStmt","UsrRatingsScore","UsrRatingsCount","RuntimeMinutes","Country","Language","Color","Title","Released","Director","Writers","Stars","Description","Storyline","PlotkeywordsKey","imdbID","imdbURL" ;
    $hashfields |%{$moviedata.Add("$($_)",$($null)) ; } ;

    $url = "http://www.imdb.com/title/$($TImdbID)" ;
    write-host -foregroundcolor green "Opening selection: '$($selection)'`nimdbID:$($TImdbID) : $($url)..." ;
    try {
        $webPage = Invoke-WebRequest -Uri $url -ErrorAction Stop ;
    } catch {
        write-warning "Title request failed for $($url): $($_.Exception.Message)" ;
        return ;
    } ;
    $pageElems = $webPage.AllElements ;

    # "subtext" line: pipe-delimited pieces, read below as either
    #   3 pieces: <ignored> | genres | release      or      4 pieces: rating | <ignored> | genres | release
    $subtext = @($pageElems | ?{$_.class -eq "subtext"}) ;
    if(($subtext.Count -gt 0) -and $subtext[0].innertext){
        $SummaryLine = @($subtext[0].innertext.split("|").trim()) ;
    } else {
        $SummaryLine = @() ;
    } ;

    $nameElems = @($pageElems | ?{$_.itemprop -eq "name"}) ;
    if($nameElems.Count -gt 0){
        # NOTE(review): this splits on a single space and keeps the first piece, so a multi-word
        # name is cut to its first word. The delimiter may originally have been "&nbsp;" before the
        # gist was rendered -- confirm against the full script before changing it.
        $moviedata.Title = ($nameElems[0].innerHTML -split " ")[0] ;
    } ;

    # Work out where rating/genres/release sit, then parse the release piece once for both layouts.
    $releaseText = $null ;
    switch($SummaryLine.count){
        3 {
            $moviedata.MpaaRating = "-" ; # always a blank rating on a 3count
            $moviedata.Genres = $($SummaryLine[1].Trim()) ;
            $releaseText = $SummaryLine[2] ;
        }
        4 {
            $moviedata.MpaaRating = $SummaryLine[0].tostring().trim() ;
            $moviedata.Genres = $($SummaryLine[2].Trim()) ;
            $releaseText = $SummaryLine[3] ;
        }
    } ;
    if($null -ne $releaseText){
        if($releaseText -match "^TV\sSeries.*$"){
            $moviedata.Type = "TV Series" ;
            if($releaseText -match "^TV\sSeries\s\((\d{4}).*"){
                # Only a year is available; build the date from it (a bare "YYYY" string does not parse).
                try {
                    $moviedata.Released = get-date -Year $matches[1] -month 1 -Day 1 -Format "yyyy" ;
                } catch {
                    $moviedata.Released = "-" ;
                } ;
            } else {
                $moviedata.Released = "-" ;
            } ;
        } else {
            $moviedata.Type = "Movie" ;
            if($releaseText -match ".*(\d{1,2}\s\w*\s\d{4}).*"){
                # Full date followed by "(Country)": re-match with a lookahead so $matches[0] is only
                # the text before the parenthesised word. The result is discarded on purpose -- a bare
                # -match would emit True/False into this function's output. If the lookahead does not
                # match, $matches still holds the whole string from the test above.
                try {
                    $null = $releaseText -match ".*(?=\s\(\w*\))" ;
                    $moviedata.Released = get-date $matches[0] -format "MM/dd/yyyy" ;
                } catch {
                    $moviedata.Released = "-" ;
                } ;
            } elseif($releaseText -match "(\d{4})\s\(\w*\)"){
                # Year-only release, e.g. "1999 (USA)".
                try {
                    $moviedata.Released = get-date -Year $matches[1] -month 1 -Day 1 -Format "yyyy" ;
                } catch {
                    $moviedata.Released = "-" ;
                } ;
            } else {
                $moviedata.Released = "-" ;
            } ;
        } ;
    } ;
    if(!$moviedata.Released){$moviedata.Released = "-" ; } ;

    # ratingValue is optional, pre-test for presence
    $ratingElems = @($pageElems | ?{$_.class -eq "ratingValue"}) ;
    if($ratingElems.Count -gt 0){
        # The statement is taken from the first double-quoted value in the element's innerHTML.
        $moviedata.UsrRatingsStmt = $ratingElems[0].innerhtml.split('"')[1].tostring().trim() ;
        $ratingParts = $moviedata.UsrRatingsStmt -split "\sbased\son\s" ;
        $moviedata.UsrRatingsScore = $ratingParts[0].tostring().trim() ;
        $moviedata.UsrRatingsCount = $ratingParts[1].tostring().replace(" user ratings","").trim() ;
    } else {
        $moviedata.UsrRatingsStmt="-" ;
        $moviedata.UsrRatingsScore="-" ;
        $moviedata.UsrRatingsCount="-" ;
    } ;

    # films freq don't have writers|Dir|Stars
    $crSum=($pageElems | ?{$_.class -eq "credit_summary_item"})| select innertext ;
    if($Dir=($crSum|?{$_ -like '*Director:*'}).innerText){
        $moviedata.Director= $Dir.tostring().replace("Director: ","").trim() ;
    } else {
        $moviedata.Director="-" ;
    } ;
    if($Writers=($crSum|?{$_ -like '*Writers:*'}).innerText){
        # Keep only the names before a trailing '| N more credit(s)' link.
        if($Writers -match ".*\|.*"){
            $moviedata.Writers= $Writers.tostring().split("|").trim()[0].replace("Writers: ","") ;
        } else {
            $moviedata.Writers= $Writers.tostring().trim().replace("Writers: ","") ;
        } ;
        # 'credits?' so the singular '1 more credit' also marks the list as truncated.
        if($Writers -match "Writers:\s.*\|\s\d{1,2}\smore\scredits?.*"){ $moviedata.Writers+="..."} ;
    } else {
        $moviedata.Writers="-" ;
    } ;
    if($Stars=($crSum|?{$_ -like '*Stars:*'}).innerText){
        if($Stars -match ".*\|.*"){
            $moviedata.Stars= $Stars.tostring().split("|").trim()[0].replace("Stars: ","") ;
        } else {
            $moviedata.Stars= $Stars.tostring().trim().replace("Stars: ","") ;
        } ;
    } else {
        $moviedata.Stars="-" ;
    } ;

    # some pages come back with no summary_text
    $summaryElems = @($pageElems | ?{$_.class -eq "summary_text"}) ;
    if($summaryElems.Count -gt 0){
        $moviedata.Description = $summaryElems[0].innertext.tostring().trim() ;
    } ;
    if(($moviedata.Description -match "^Add\sa\sPlot\s.*") -OR (!$moviedata.Description)){
        $moviedata.Description = "-" ;
    } ;

    $storyElems = @($pageElems | ?{$_.class -eq "inline canwrap"}) ;
    if($storyElems.Count -gt 0){
        $moviedata.Storyline = $storyElems[0].innertext.tostring().trim() ;
    } else {
        $moviedata.Storyline = "-" ;
    } ;

    $keywordElems = @($pageElems | ?{$_.class -eq "see-more inline canwrap"}) ;
    if($keywordElems.Count -gt 0){
        if($keywordElems[0].outerText -match "(Plot\sKeywords:\s.*\s)\|\sSee\sAll\s\(\d*\)\s.*" ){
            $moviedata.PlotkeywordsKey = $matches[1].tostring().trim() ;
        } ;
    } ;
    if(!$moviedata.PlotkeywordsKey){
        $moviedata.PlotkeywordsKey = "-" ;
    } ;

    # Country / Language / Color all come from the same "titleDetails" element: look it up once.
    $detailsText = $null ;
    $titleDetails = @($pageElems | ?{$_.id -eq "titleDetails"}) ;
    if($titleDetails.Count -gt 0){
        $detailsText = $titleDetails[0].innertext ;
    } ;
    if($detailsText -match ".*(Country:\s.*)"){
        # also replace out the pipe with comma
        $moviedata.Country=($matches[1] -replace "\s\|\s","," -replace("Country: ","")).tostring().trim() ;
    } ;
    if(!$moviedata.Country){ $moviedata.Country="-" } ;
    if($detailsText -match ".*(Language:\s.*)"){
        # replace pipe->comma
        $moviedata.Language=$matches[1].tostring().replace("Language: ","").trim() -replace "\s\|\s","," ;
    } ;
    if(!$moviedata.Language){ $moviedata.Language="-" } ;
    if($detailsText -match ".*(Color:.*)"){
        # Use the capture group (as Country/Language do) so text before "Color:" is not included.
        $moviedata.Color=$matches[1].tostring().replace("Color: ","").trim() -replace("\|\s",",") ;
    } ;
    if(!$moviedata.Color){ $moviedata.Color = "-" } ;

    $moviedata.imdbID=$($TImdbID) ;
    $moviedata.imdbURL=$($URL) ;

    # Runtime: normalise "1h 27min" to "87min" and "87 min" to "87min"; the last duration element is used.
    $moviedata.RuntimeMinutes = "-" ;
    $duration = @($pageElems | ?{$_.itemprop -eq "duration"}) ;
    if($duration.Count -gt 0){
        $runtimeText = $duration[-1].innertext ;
        if($runtimeText -match "(\d{1,2}h\s\d{1,2}min)"){
            $timestamp = $matches[0].tostring().trim().replace(" ","") ;
            $hours = [int]($timestamp.split('h')[0]) ;
            $minutes = [int](($timestamp.split('h')[1]).replace('min','')) ;
            $moviedata.RuntimeMinutes = "$($hours * 60 + $minutes)min" ;
        } elseif($runtimeText -match "(\d{1,3}\smin)"){
            $moviedata.RuntimeMinutes = $matches[0].tostring().trim().replace(" ","") ;
        } ;
    } ;

    # dump hash into pipeline (formatting should be handled on receiving end, we just do source data in this func)
    $moviedata | write-output ;
} #*------^ END Function Get-IMDBSearch ^------;
<# TRIMMED #>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment