Last active
November 26, 2022 22:16
-
-
Save thinkofher/bffff1146890cbb420debfc9d6883a20 to your computer and use it in GitHub Desktop.
KRS scraping with Go for fun and profit
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module github.com/thinkofher/scrapper-boy

go 1.19
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"encoding/json" | |
"fmt" | |
"log" | |
"net/http" | |
"os" | |
"strings" | |
"sync" | |
) | |
// scrappedResponse is the JSON envelope decoded from a single results-page
// response. Only the "content" field is kept; any other fields in the
// response are ignored by the decoder.
type scrappedResponse struct {
	// Content is the raw value of the "content" key, stored verbatim.
	Content string `json:"content"`
}
func processRequest(wg *sync.WaitGroup, mtx *sync.Mutex, i int, out chan<- scrappedResponse) { | |
defer wg.Done() | |
log.Printf("processing request for page=%d", i) | |
url := fmt.Sprintf("https://ekrs.ms.gov.pl/krsrdf/krs/wyszukiwaniepodmiotu.podmiotygrid.pager/%d/podmiotyGrid", i) | |
body := strings.NewReader("t%3Azoneid=podmiotyGrid&t%3Aformid=form&t%3Aformcomponentid=krs%2FWyszukiwaniePodmiotu%3Aform") | |
req, err := http.NewRequest("POST", url, body) | |
if err != nil { | |
log.Printf("new request error for page=%d: %s", i, err) | |
return | |
} | |
req.Header.Set("Accept", "text/javascript, text/html, application/xml, text/xml, */*") | |
req.Header.Set("Accept-Language", "en-GB,en-US;q=0.9,en;q=0.8,pl;q=0.7") | |
req.Header.Set("Connection", "keep-alive") | |
req.Header.Set("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8") | |
req.Header.Set("Cookie", "f5avraaaaaaaaaaaaaaaa_session_=MKGEEFBNHMPFLEGNDLNDLPNANKGIPINKDFHENKOEKJBNFDICBBDMNLFICEIPFNBHCMDDJKFGGMDOLGELCKEALEMEIIIANKDMILLANBNEIKCFFCLNCAHEHJOIMGGNGFGF; f5_cspm=1234; f5avraaaaaaaaaaaaaaaa_session_=PFEPGKBHEODKIKHEDELBABDLGINLCGOBOKIMHFAKNLCMIDEHNFHMNFHIINEBGECOCPLDPHPPIKAINEEKHFCAABFNIIHIAOMOMIFBODIOPGGDKOGBGJKGBKPHLNOAPMGN; JSESSIONID=5ee5906209f178b2302af6dcc191; TS01b5400c=0172db1af95d1d62ddf667d7586ddd14a32d03f6d2f8eb4535c0df9602aa8d25aa59362538be795285259f0ee28d47b9fe72ca3897f617917d2949c728e3083ddb3fa0e99d7d8d03796621b4d20e9e10048cfe3212e4ba270c0a31e8ffb24ae840e72b057845a28d58740d4eca765edbe05b937ea7; f5avraaaaaaaaaaaaaaaa_session_=AKFFBAJBMMEPPKPPKKNBLPFHHDACIGABPCHJHAILJOLNOJPOFHDDODHKCNGLJFGAMDLDBFEJANLJBOKDFENALONOFIJAEPNMOIHHFLPBDCCKHFMBPAGHCFBLFCOIJENH; TSPD_101=08c5699bd4ab280003b45b30c13e083ecaab5279b4a02be9579885f47f11007a84e137801989d13f51325707d0c5d44308de1ea94505180021d987e84940b600e02c46e6e48a73a9abfd764087644c53; TS00000000076=08c5699bd4ab2800167a870283ec0624bbe3106d9915a697097d8abcd4851da25dee738dbcaca64089f0aeec929b491708211a1dd709d0001d1781734bbda870d1d14f0bd3c0a979c7d61570641e4791efc9b5293d8ae4dd4b0496197c82bbf346224253f43ca29ba1cdb3093b69df2cbc6e6116abea749c49b0c619ac6b45fb643f5c557e7dcdaa028397a0c849ee01deedd40a51a7c1742dfb83893b5748afb4e999d3319f9791ed908e194029ff683f44a624fcde56d3a2e23cdf839934812981d4d3c9c07eb507417cdbcdc171b84b0f69d2219dc479426a97238d8c13be8abb4b474fe3681033335f3160199d693deece14c642befeadeefe1eddb0594b30d62d7cd3c0d917; TSPD_101_DID=08c5699bd4ab2800167a870283ec0624bbe3106d9915a697097d8abcd4851da25dee738dbcaca64089f0aeec929b491708211a1dd70638005b3958190db3d4910865942213cc853fdfbb5f6ea82c8ad330e2351bd2fe23f7dacd2696d6117a2ecefea77b914f672af0e9ca0cda99efd1; f5_cspm=1234; TSb3efed9b077=08c5699bd4ab28002d48576e19ac130e0f380d69fce049f9c1c1d5f43565b72e927b6bdffa2119d00816993f514e6329085b0008cd172000309fe97929809047bdb90b14b66dbc46810c65a8bc7a950486ba99124ad8abb6; 
zeY5MWCa2Sf8xmwF=\\u0021TO1EoTKYPiLVexZDKQxFlDQ1mW49Zaj01kx+crysSU2vwJ/zxM/fDFSfG8M9TtMUkZExXoygzxIJ1Kk=; TS01363499=0172db1af93394ad25321109bf050089a4027ca4b48bf1821bb55eff1ecd57b2b98c0c9d0e91fdc2cca578f2389531774636dcea7aa71eb9c9f11ed29222acb285126ff2c6dd1298f7da1cc9ecee582cce7e6131fb876f4b444628748b7059486b47bf7cb60c2c1627237b7a371f364f5e1e30e3df; TSb3efed9b029=08c5699bd4ab280051ddd4d85d8d4f869ca6520c96183b9f52fe54bb3f1eb3dfd85d991d283c051af76268875fcedfb6; TS364335be027=08c5699bd4ab20007e4da354d8b9c1f64183b5225e617240a442236061c748014cd8bd110a67f1f20845423830113000cec040f3d543c9fd07fc2dca96fc36c9feced188448ea63f992234a622504b0c133196774ef5cbb22f038a4682e2caa8") | |
req.Header.Set("Dnt", "1") | |
req.Header.Set("Origin", "https://ekrs.ms.gov.pl") | |
req.Header.Set("Referer", "https://ekrs.ms.gov.pl/krsrdf/krs/wyszukiwaniepodmiotu?t:lb=t") | |
req.Header.Set("Sec-Fetch-Dest", "empty") | |
req.Header.Set("Sec-Fetch-Mode", "cors") | |
req.Header.Set("Sec-Fetch-Site", "same-origin") | |
req.Header.Set("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36") | |
req.Header.Set("X-Prototype-Version", "1.7") | |
req.Header.Set("X-Requested-With", "XMLHttpRequest") | |
req.Header.Set("Sec-Ch-Ua", "\"Chromium\";v=\"107\", \"Not=A?Brand\";v=\"24\"") | |
req.Header.Set("Sec-Ch-Ua-Mobile", "?0") | |
req.Header.Set("Sec-Ch-Ua-Platform", "\"macOS\"") | |
mtx.Lock() | |
resp, err := http.DefaultClient.Do(req) | |
mtx.Unlock() | |
if err != nil { | |
log.Printf("HTTP request error for page=%d: %s", i, err) | |
return | |
} | |
defer resp.Body.Close() | |
var parsed scrappedResponse | |
if err := json.NewDecoder(resp.Body).Decode(&parsed); err != nil { | |
log.Printf("json decoding failure for page=%d: %s", i, err) | |
return | |
} | |
out <- parsed | |
} | |
func glueOutput(in <-chan scrappedResponse, out chan<- []scrappedResponse) { | |
res := []scrappedResponse{} | |
for sr := range in { | |
res = append(res, sr) | |
} | |
out <- res | |
close(out) | |
} | |
func main() { | |
pages := 30 | |
mtx := sync.Mutex{} | |
wg := sync.WaitGroup{} | |
wg.Add(pages) | |
c := make(chan scrappedResponse) | |
cs := make(chan []scrappedResponse) | |
go glueOutput(c, cs) | |
for i := 1; i <= pages; i++ { | |
go processRequest(&wg, &mtx, i, c) | |
} | |
wg.Wait() | |
close(c) | |
f, err := os.Create("out.json") | |
if err != nil { | |
return | |
} | |
defer f.Close() | |
out := <-cs | |
if err := json.NewEncoder(f).Encode(out); err != nil { | |
log.Printf("json encoding error: %s", err) | |
return | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment