Skip to content

Instantly share code, notes, and snippets.

@thinkofher
Last active November 26, 2022 22:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thinkofher/bffff1146890cbb420debfc9d6883a20 to your computer and use it in GitHub Desktop.
Save thinkofher/bffff1146890cbb420debfc9d6883a20 to your computer and use it in GitHub Desktop.
KRS scraping with Go for fun and profit
module github.com/thinkofher/scrapper-boy
go 1.19
package main
import (
"encoding/json"
"fmt"
"log"
"net/http"
"os"
"strings"
"sync"
)
// scrappedResponse is the JSON document decoded from a single page response.
// Only the top-level "content" field is kept; any other fields in the payload
// are ignored by the decoder.
type scrappedResponse struct {
	// Content holds the value of the "content" key, stored verbatim.
	Content string `json:"content"`
}
func processRequest(wg *sync.WaitGroup, mtx *sync.Mutex, i int, out chan<- scrappedResponse) {
defer wg.Done()
log.Printf("processing request for page=%d", i)
url := fmt.Sprintf("https://ekrs.ms.gov.pl/krsrdf/krs/wyszukiwaniepodmiotu.podmiotygrid.pager/%d/podmiotyGrid", i)
body := strings.NewReader("t%3Azoneid=podmiotyGrid&t%3Aformid=form&t%3Aformcomponentid=krs%2FWyszukiwaniePodmiotu%3Aform")
req, err := http.NewRequest("POST", url, body)
if err != nil {
log.Printf("new request error for page=%d: %s", i, err)
return
}
req.Header.Set("Accept", "text/javascript, text/html, application/xml, text/xml, */*")
req.Header.Set("Accept-Language", "en-GB,en-US;q=0.9,en;q=0.8,pl;q=0.7")
req.Header.Set("Connection", "keep-alive")
req.Header.Set("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8")
req.Header.Set("Cookie", "f5avraaaaaaaaaaaaaaaa_session_=MKGEEFBNHMPFLEGNDLNDLPNANKGIPINKDFHENKOEKJBNFDICBBDMNLFICEIPFNBHCMDDJKFGGMDOLGELCKEALEMEIIIANKDMILLANBNEIKCFFCLNCAHEHJOIMGGNGFGF; f5_cspm=1234; f5avraaaaaaaaaaaaaaaa_session_=PFEPGKBHEODKIKHEDELBABDLGINLCGOBOKIMHFAKNLCMIDEHNFHMNFHIINEBGECOCPLDPHPPIKAINEEKHFCAABFNIIHIAOMOMIFBODIOPGGDKOGBGJKGBKPHLNOAPMGN; JSESSIONID=5ee5906209f178b2302af6dcc191; TS01b5400c=0172db1af95d1d62ddf667d7586ddd14a32d03f6d2f8eb4535c0df9602aa8d25aa59362538be795285259f0ee28d47b9fe72ca3897f617917d2949c728e3083ddb3fa0e99d7d8d03796621b4d20e9e10048cfe3212e4ba270c0a31e8ffb24ae840e72b057845a28d58740d4eca765edbe05b937ea7; f5avraaaaaaaaaaaaaaaa_session_=AKFFBAJBMMEPPKPPKKNBLPFHHDACIGABPCHJHAILJOLNOJPOFHDDODHKCNGLJFGAMDLDBFEJANLJBOKDFENALONOFIJAEPNMOIHHFLPBDCCKHFMBPAGHCFBLFCOIJENH; TSPD_101=08c5699bd4ab280003b45b30c13e083ecaab5279b4a02be9579885f47f11007a84e137801989d13f51325707d0c5d44308de1ea94505180021d987e84940b600e02c46e6e48a73a9abfd764087644c53; TS00000000076=08c5699bd4ab2800167a870283ec0624bbe3106d9915a697097d8abcd4851da25dee738dbcaca64089f0aeec929b491708211a1dd709d0001d1781734bbda870d1d14f0bd3c0a979c7d61570641e4791efc9b5293d8ae4dd4b0496197c82bbf346224253f43ca29ba1cdb3093b69df2cbc6e6116abea749c49b0c619ac6b45fb643f5c557e7dcdaa028397a0c849ee01deedd40a51a7c1742dfb83893b5748afb4e999d3319f9791ed908e194029ff683f44a624fcde56d3a2e23cdf839934812981d4d3c9c07eb507417cdbcdc171b84b0f69d2219dc479426a97238d8c13be8abb4b474fe3681033335f3160199d693deece14c642befeadeefe1eddb0594b30d62d7cd3c0d917; TSPD_101_DID=08c5699bd4ab2800167a870283ec0624bbe3106d9915a697097d8abcd4851da25dee738dbcaca64089f0aeec929b491708211a1dd70638005b3958190db3d4910865942213cc853fdfbb5f6ea82c8ad330e2351bd2fe23f7dacd2696d6117a2ecefea77b914f672af0e9ca0cda99efd1; f5_cspm=1234; TSb3efed9b077=08c5699bd4ab28002d48576e19ac130e0f380d69fce049f9c1c1d5f43565b72e927b6bdffa2119d00816993f514e6329085b0008cd172000309fe97929809047bdb90b14b66dbc46810c65a8bc7a950486ba99124ad8abb6; 
zeY5MWCa2Sf8xmwF=\\u0021TO1EoTKYPiLVexZDKQxFlDQ1mW49Zaj01kx+crysSU2vwJ/zxM/fDFSfG8M9TtMUkZExXoygzxIJ1Kk=; TS01363499=0172db1af93394ad25321109bf050089a4027ca4b48bf1821bb55eff1ecd57b2b98c0c9d0e91fdc2cca578f2389531774636dcea7aa71eb9c9f11ed29222acb285126ff2c6dd1298f7da1cc9ecee582cce7e6131fb876f4b444628748b7059486b47bf7cb60c2c1627237b7a371f364f5e1e30e3df; TSb3efed9b029=08c5699bd4ab280051ddd4d85d8d4f869ca6520c96183b9f52fe54bb3f1eb3dfd85d991d283c051af76268875fcedfb6; TS364335be027=08c5699bd4ab20007e4da354d8b9c1f64183b5225e617240a442236061c748014cd8bd110a67f1f20845423830113000cec040f3d543c9fd07fc2dca96fc36c9feced188448ea63f992234a622504b0c133196774ef5cbb22f038a4682e2caa8")
req.Header.Set("Dnt", "1")
req.Header.Set("Origin", "https://ekrs.ms.gov.pl")
req.Header.Set("Referer", "https://ekrs.ms.gov.pl/krsrdf/krs/wyszukiwaniepodmiotu?t:lb=t")
req.Header.Set("Sec-Fetch-Dest", "empty")
req.Header.Set("Sec-Fetch-Mode", "cors")
req.Header.Set("Sec-Fetch-Site", "same-origin")
req.Header.Set("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36")
req.Header.Set("X-Prototype-Version", "1.7")
req.Header.Set("X-Requested-With", "XMLHttpRequest")
req.Header.Set("Sec-Ch-Ua", "\"Chromium\";v=\"107\", \"Not=A?Brand\";v=\"24\"")
req.Header.Set("Sec-Ch-Ua-Mobile", "?0")
req.Header.Set("Sec-Ch-Ua-Platform", "\"macOS\"")
mtx.Lock()
resp, err := http.DefaultClient.Do(req)
mtx.Unlock()
if err != nil {
log.Printf("HTTP request error for page=%d: %s", i, err)
return
}
defer resp.Body.Close()
var parsed scrappedResponse
if err := json.NewDecoder(resp.Body).Decode(&parsed); err != nil {
log.Printf("json decoding failure for page=%d: %s", i, err)
return
}
out <- parsed
}
// glueOutput drains in until it is closed, gathering every received value in
// arrival order. The gathered slice is then delivered as a single message on
// out, after which out is closed. When nothing was received the slice is
// empty but non-nil.
func glueOutput(in <-chan scrappedResponse, out chan<- []scrappedResponse) {
	// Deferred so the close always follows the single send below.
	defer close(out)

	collected := make([]scrappedResponse, 0)
	for {
		item, open := <-in
		if !open {
			break
		}
		collected = append(collected, item)
	}

	out <- collected
}
// main requests pages 1 through 30, one goroutine per page, collects every
// successfully decoded response and writes them as a JSON array to out.json
// in the current directory.
//
// Pages that fail are logged by processRequest and simply left out of the
// output. Failing to create, write or close out.json is logged and ends the
// program with a non-zero exit status.
func main() {
	const pages = 30

	var (
		mtx sync.Mutex
		wg  sync.WaitGroup
	)
	wg.Add(pages)

	c := make(chan scrappedResponse)
	cs := make(chan []scrappedResponse)
	// The collector must be running before any worker sends, because c is
	// unbuffered.
	go glueOutput(c, cs)

	for i := 1; i <= pages; i++ {
		go processRequest(&wg, &mtx, i, c)
	}

	// Every worker has finished once Wait returns, so closing c is safe and
	// lets glueOutput deliver the collected slice.
	wg.Wait()
	close(c)

	f, err := os.Create("out.json")
	if err != nil {
		log.Fatalf("creating out.json: %s", err)
	}

	out := <-cs
	if err := json.NewEncoder(f).Encode(out); err != nil {
		// Best-effort close; the encoding error is the one worth reporting.
		f.Close()
		log.Fatalf("json encoding error: %s", err)
	}

	// Close explicitly rather than via defer so a failure to close the file
	// that was just written is not silently dropped.
	if err := f.Close(); err != nil {
		log.Fatalf("closing out.json: %s", err)
	}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment