Skip to content

Instantly share code, notes, and snippets.

@aolney
Created June 29, 2018 19:54
Show Gist options
  • Save aolney/164681d38c6c728945ec4b7a95b3df49 to your computer and use it in GitHub Desktop.
Save aolney/164681d38c6c728945ec4b7a95b3df49 to your computer and use it in GitHub Desktop.
South Park Script Downloader - complements https://github.com/BobAdamsEE/SouthParkData
#r "/z/aolney/repos/HtmlAgilityPack.1.4.9.5/lib/Net40/HtmlAgilityPack.dll"
open HtmlAgilityPack
/// Script pages to download, as (season, episode number, url) triples.
/// The links were collected from each season page with the Firefox plugin
/// 'Link Gopher', which does not return links in order, so they are arranged
/// here by hand in episode order.
let urls =
    // Within a season the pages are listed in episode order, so the episode
    // number is simply the 1-based position of the page in its group.
    let numbered season pages =
        pages |> Array.mapi (fun index page -> (season, index + 1, page))
    Array.append
        (numbered 19
            [|
                "http://southpark.wikia.com/wiki/Stunning_and_Brave/Script"
                "http://southpark.wikia.com/wiki/Where_My_Country_Gone%3F/Script"
                "http://southpark.wikia.com/wiki/The_City_Part_of_Town/Script"
                "http://southpark.wikia.com/wiki/You're_Not_Yelping/Script"
                "http://southpark.wikia.com/wiki/Safe_Space/Script"
                "http://southpark.wikia.com/wiki/Tweek_x_Craig/Script"
                "http://southpark.wikia.com/wiki/Naughty_Ninjas/Script"
                "http://southpark.wikia.com/wiki/Sponsored_Content/Script"
                "http://southpark.wikia.com/wiki/Truth_and_Advertising/Script"
                "http://southpark.wikia.com/wiki/PC_Principal_Final_Justice/Script"
            |])
        (numbered 20
            [|
                "http://southpark.wikia.com/wiki/Member_Berries/Script"
                "http://southpark.wikia.com/wiki/Skank_Hunt/Script"
                "http://southpark.wikia.com/wiki/The_Damned/Script"
                "http://southpark.wikia.com/wiki/Wieners_Out/Script"
                "http://southpark.wikia.com/wiki/Douche_and_a_Danish/Script"
                "http://southpark.wikia.com/wiki/Fort_Collins/Script"
                "http://southpark.wikia.com/wiki/Oh,_Jeez/Script"
                "http://southpark.wikia.com/wiki/Members_Only/Script"
                "http://southpark.wikia.com/wiki/Not_Funny/Script"
                "http://southpark.wikia.com/wiki/The_End_of_Serialization_as_We_Know_It/Script"
            |])
/// Matches one bracketed span of non-speech markup, e.g. "[sighs]".
/// An empty pair "[]" is not matched because at least one inner character is required.
let bracketRegex = System.Text.RegularExpressions.Regex("\[[^\]]+\]")

/// Removes every bracketed span from `text`, then trims colons, spaces and
/// newlines from both ends of what remains. Characters in the interior of the
/// result (including any doubled space left where a bracket was removed) are kept.
let CleanText (text : string) =
    // Characters stripped from the start and end only.
    let edgeNoise = [| ':'; ' '; '\n' |]
    let withoutBrackets = bracketRegex.Replace(text, "")
    withoutBrackets.Trim(edgeNoise)
/// Every spoken line from the episodes in `urls`, as
/// (season, episode, speaker, text) tuples in page order.
/// Evaluating this binding downloads and parses each script page in turn and
/// echoes each URL to stdout as it is fetched.
let scripts =
    // One client is enough; it is reused for every download.
    let web = HtmlWeb()

    // Fallback table choice: the table with the most child nodes.
    // HtmlAgilityPack's SelectNodes returns null (not an empty collection)
    // when nothing matches, so that case is mapped to None instead of being
    // handed to Seq.maxBy, which would throw.
    let largestTable (doc : HtmlDocument) =
        match doc.DocumentNode.SelectNodes("//table") with
        | null -> None
        | tables when tables.Count = 0 -> None
        | tables -> tables |> Seq.maxBy (fun t -> t.ChildNodes.Count) |> Some

    // Picks the table holding the script: the first table whose class attribute
    // is exactly 'wikitable'; sometimes that attribute is missing, in which
    // case the largest table on the page is used.
    let findScriptTable (doc : HtmlDocument) =
        match doc.DocumentNode.SelectSingleNode("//table[@class='wikitable']") with
        | null -> largestTable doc
        | table -> Some table

    // Turns one table row into a record, or None when the row is not dialogue.
    let parseRow season episode (tr : HtmlNode) =
        // Cells can be th or td; a row must have exactly two or it is invalid.
        let cells =
            tr.ChildNodes
            |> Seq.filter (fun n -> n.Name = "th" || n.Name = "td")
            |> Seq.toArray
        match cells with
        | [| speakerCell; textCell |] ->
            let speaker = CleanText speakerCell.InnerText
            let text = CleanText textCell.InnerText
            // Text without a speaker is action/setting, not a spoken line.
            if speaker <> "" && text <> "" then Some (season, episode, speaker, text)
            else None
        | _ -> None

    urls
    |> Array.collect (fun (season, episode, url) ->
        System.Console.WriteLine(url)
        let doc = web.Load(url)
        match findScriptTable doc with
        | Some table ->
            table.Elements("tr")
            |> Seq.choose (parseRow season episode)
            |> Seq.toArray
        | None ->
            // Skip this episode rather than abort, so the remaining pages are still collected.
            System.Console.Error.WriteLine("No table found, skipping: " + url)
            [||])
// Formats one record as a single tab-separated line: season, episode, speaker, text.
// A tab or line break inside a field would split the record across columns or
// lines, so each run of those characters is replaced by one space first.
let toTsvLine (season : int, episode : int, speaker : string, text : string) =
    let flatten (field : string) =
        System.Text.RegularExpressions.Regex.Replace(field, "[\t\r\n]+", " ")
    String.concat "\t" [ string season; string episode; flatten speaker; flatten text ]

// Write every record to disk, one per line. The file keeps its ".csv" name
// even though the delimiter is a tab.
System.IO.File.WriteAllLines(
    "sp-scripts-temp.csv",
    scripts |> Seq.map toTsvLine)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment