Skip to content

Instantly share code, notes, and snippets.

@Chirishman
Created November 5, 2018 22:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Chirishman/b9d0bca1cf37e326adcf63d6142dfd44 to your computer and use it in GitHub Desktop.
Save Chirishman/b9d0bca1cf37e326adcf63d6142dfd44 to your computer and use it in GitHub Desktop.
Prototype tool for downloading threadmarked XenForo stories
function Get-ForumStory {
    <#
    .SYNOPSIS
    Downloads the threadmarked posts of one or more XenForo threads and writes
    each story to a single text or HTML file.

    .DESCRIPTION
    For every story URI the function:
      1. Reduces the URI to scheme + host + the first three path segments.
      2. Requests the 'threadmarks' page below that base URI, reads the story
         title from the page title and counts the threadmark entries.
      3. Requests the 'reader' pages (page=1..N) and collects the title, id,
         inner HTML and inner text of every threadmarked post.
    Afterwards one file per story is written to OutputFolder. The file name is
    the story title with invalid file-name characters replaced by '-'.

    The HTML is inspected through the ParsedHtml property of the
    Invoke-WebRequest response.
    NOTE(review): ParsedHtml is believed to be populated only by Windows
    PowerShell (not PowerShell 6+) - confirm before running elsewhere.

    .PARAMETER StoryUri
    One or more thread URIs. Anything after the third path segment is dropped.

    .PARAMETER OutputType
    'text' writes the posts' innerText to <title>.txt,
    'html' writes the posts' innerHTML to <title>.htm.

    .PARAMETER OutputFolder
    Folder the files are written to. Created when it does not exist.
    Defaults to 'C:\temp\'.

    .EXAMPLE
    Get-ForumStory -StoryUri 'https://forum.example/threads/some-story.1234/' -OutputType text -Verbose
    #>
    [CmdletBinding()]
    Param(
        [Parameter(Mandatory)]
        [uri[]]$StoryUri,
        [Parameter(Mandatory)]
        [ValidateSet('text','html')]
        [string]$OutputType,
        [Parameter()]
        [string]$OutputFolder = 'C:\temp\'
    )

    # Make sure the target folder exists before any scraping starts, so a bad
    # path fails immediately instead of after every page has been downloaded.
    if (-not (Test-Path -LiteralPath $OutputFolder -PathType Container)) {
        [void](New-Item -Path $OutputFolder -ItemType Directory -Force -ErrorAction Stop)
    }

    # Keep only scheme, host and the first three path segments of each URI.
    [uri[]]$CleanedStoryURI = ($StoryUri | ForEach-Object {-join($_.Scheme,'://',$_.Host,-join($_.Segments | Select-Object -First 3))})

    # Story title -> list of chapter objects, in the order the stories were given.
    $StoryObject = [Ordered]@{}

    # Path suffixes appended to the story base URI.
    $Suffixes = @{
        TOC = 'threadmarks'
        Ch  = 'reader'
    }

    # The page count is derived by dividing the threadmark count by this value.
    # NOTE(review): presumably the forum's reader view shows 10 posts per page - confirm.
    $ChaptersPerPage = 10

    # Calculated properties that pull the interesting parts out of a post's <li> element.
    $ChapterSelect = @(
        @{
            N='Title';
            E={$_.children[0].children[0].innerText -replace '^Threadmarks\:\ '}
        },
        @{
            N='Id';
            E={$_.id}
        },
        @{
            N='ChapterHtml';
            E={$_.children[2].children[1].children[0].children[0].innerHTML}
        },
        @{
            N='ChapterText';
            E={$_.children[2].children[1].children[0].children[0].innerText}
        }
    )

    foreach ($StoryBaseURI in $CleanedStoryURI) {
        $TocBuilder = [System.UriBuilder]::new($StoryBaseURI)
        $TocBuilder.Path += $Suffixes.TOC
        $TocUri = $TocBuilder.Uri

        # Without the table of contents there is no title and no page count,
        # so a failed request skips this story instead of producing a bogus file.
        try {
            $TOC = Invoke-WebRequest -Uri $TocUri -SessionVariable StorySession -ErrorAction Stop
        } catch {
            Write-Error -Message "Could Not Reach $TocUri - $($_.Exception.Message)"
            continue
        }

        $TOCParsed = $TOC.ParsedHtml
        $StoryTitle = (($TOCParsed.title -split '\ \|\ ')[0] -replace '^Threadmarks\ for\:\ ').Trim()
        Write-Verbose -Message "Starting - $StoryTitle"

        $TitleList = ($TOCParsed.getElementsByTagName('ol') | Where-Object {$_.ClassName -eq 'ThreadmarkCategory_1'} | Select-Object -ExpandProperty children) | Select-Object -ExpandProperty children | ForEach-Object {$_[0].innerText}
        $Pages = [math]::Ceiling(($TitleList | Measure-Object).Count / $ChaptersPerPage)

        # 1..0 would count DOWN and request page=1 and page=0, so bail out explicitly.
        if ($Pages -lt 1) {
            Write-Warning -Message "No threadmarks found at $TocUri - skipping '$StoryTitle'"
            continue
        }

        $StoryObject[$StoryTitle] = [System.Collections.ArrayList]::new()

        foreach ($PageNumber in 1..$Pages) {
            $PageBuilder = [System.UriBuilder]::new($StoryBaseURI)
            $PageBuilder.Path += $Suffixes.Ch
            $PageBuilder.Query = "page=$PageNumber"
            $ThisPageURI = $PageBuilder.Uri
            # Honours the caller's -Verbose switch instead of forcing output.
            Write-Verbose -Message "Querying $ThisPageURI"
            try {
                $Page = Invoke-WebRequest -Uri $ThisPageURI -WebSession $StorySession -ErrorAction Stop
                # The class-name pattern (five spaces + newline) is matched exactly as the site emits it.
                $ThisPageChapters = ($Page.ParsedHtml.getElementsByTagName('li') | Where-Object {$_.className -match 'message\ \ \ \ \ \nhasThreadmark ThreadmarkCategory_1'}) | Select-Object -Property $ChapterSelect
                foreach ($Chapter in $ThisPageChapters) {
                    [void]$StoryObject[$StoryTitle].Add($Chapter)
                }
            } catch {
                # Report the underlying cause too; the failure may be in parsing, not the request.
                Write-Error -Message "Could Not Reach $ThisPageURI - $($_.Exception.Message)"
            }
            # Pause between page requests.
            Start-Sleep -Seconds 2
        }
    }

    # Alternation of every escaped invalid file-name character; built once for all stories.
    $InvalidCharPattern = ([System.IO.Path]::GetInvalidFileNameChars() | ForEach-Object {[regex]::Escape($_)}) -join '|'

    foreach ($Story in $StoryObject.GetEnumerator()) {
        if ($OutputType -eq 'text') {
            $Extension = '.txt'
            $Content = $Story.Value.ChapterText
        } elseif ($OutputType -eq 'html') {
            $Extension = '.htm'
            $Content = $Story.Value.ChapterHtml
        }
        $FileName = ($Story.Name -replace $InvalidCharPattern,'-').Trim() + $Extension
        ($Content -join "`r`n") | Out-File -LiteralPath (Join-Path -Path $OutputFolder -ChildPath $FileName)
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment