Skip to content

Instantly share code, notes, and snippets.

@benfb
Created June 15, 2019 19:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save benfb/0d007e114a9f0074561419800928ae33 to your computer and use it in GitHub Desktop.
Save benfb/0d007e114a9f0074561419800928ae33 to your computer and use it in GitHub Desktop.
A rough draft of a Julia script to create a DataFrame from Fangraphs data
using Gumbo
using Cascadia
using Tables
using DataFrames
using CSV
import HTTP
# Download the Fangraphs leaderboard page and pick out the stats table.
#
# The `type=` query parameter is "c,-1," followed by the custom column ids
# 3 through 299; the id list is generated rather than spelled out by hand.
leaderboardurl = string(
    "https://www.fangraphs.com/leaders.aspx",
    "?pos=all&stats=rel&lg=0&qual=10",
    "&type=c,-1,", join(3:299, ","),
    "&season=2019&month=0&season1=2019",
    "&ind=&team=&rost=&age=&filter=&players=&page=1_100000",
)
r = HTTP.request("GET", leaderboardurl)
ht = parsehtml(String(r.body))

# The stats table is selected purely by position: it is the 12th <table> in
# the document. NOTE(review): this index depends on the page layout - confirm
# it still points at the leaderboard if the scrape starts returning odd data.
tables = eachmatch(Selector("table"), ht.root)
length(tables) >= 12 || error(
    "expected at least 12 <table> elements in the Fangraphs page, " *
    "found $(length(tables)); the page layout may have changed",
)
tbl = tables[12]
# Collect the column headers of `tbl`.
#
# Only <th> cells whose first child is an <a> element are used; the header
# text is the first child of that link. Header cells that are empty, or whose
# link has no content, are skipped instead of raising a BoundsError.
mynames = String[]
for e in eachmatch(Selector("th"), tbl)
    kids = children(e)
    isempty(kids) && continue
    link = first(kids)
    if link isa HTMLElement{:a} && !isempty(children(link))
        label = string(link[1])
        push!(mynames, label)
        # Echo each header (followed by a blank line) as progress output.
        println(label)
        println("")
    end
end

# String copy of the headers in which the second-to-last name has "%" spelled
# out as "_pct". `replace` returns a new string, so the result has to be
# stored back for the substitution to have any effect.
# NOTE(review): the column names written to the CSV are built from `mynames`,
# not from `namesstring` - confirm whether this substitution is still wanted.
namesstring = map(string, mynames)
if length(namesstring) >= 2
    namesstring[end-1] = replace(namesstring[end-1], "%" => "_pct")
end
"""
    cellvalue(cell)

Convert one table cell node into a value for the data matrix.

The value is taken from the cell's first child. If that child is not a text
node (presumably a link or similar wrapper), its own first child is used
instead. The resulting text is returned as a `Float64` when it parses as a
number and as a `String` otherwise. A cell with no content yields `missing`.
"""
function cellvalue(cell)
    kids = children(cell)
    isempty(kids) && return missing
    node = first(kids)
    if !(node isa HTMLText)
        inner = children(node)
        isempty(inner) && return missing
        node = first(inner)
    end
    raw = string(node)
    # `tryparse` returns `nothing` for non-numeric text; keep the raw string then.
    return something(tryparse(Float64, raw), raw)
end

# Column names for the DataFrame, one per collected header.
symbolnames = Symbol[Symbol(n) for n in mynames]

# For each row in the stats table, convert every cell except the first one.
# Rows are gathered with a comprehension and copied into a preallocated
# matrix, so no global variable is reassigned inside a top-level loop (that
# fails when the file is run as a script) and nothing is grown by repeated
# concatenation.
tbody = first(eachmatch(Selector("tbody"), tbl))
rows = [Any[cellvalue(td) for td in Iterators.drop(children(row), 1)]
        for row in children(tbody)]

matr2 = Matrix{Any}(undef, length(rows), length(symbolnames))
for (i, cells) in enumerate(rows)
    length(cells) == size(matr2, 2) || throw(DimensionMismatch(
        "row $i has $(length(cells)) data cells but " *
        "$(size(matr2, 2)) column headers were found",
    ))
    matr2[i, :] = cells
end

# Build the DataFrame directly from the matrix and the header names;
# `makeunique=true` disambiguates headers that occur more than once.
df5 = DataFrame(matr2, symbolnames; makeunique=true)
CSV.write("dataframetest1.csv", df5)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment