Created
November 26, 2022 23:32
-
-
Save thinkofher/5d2b5e6ba5cd38cd698512a59c76150f to your computer and use it in GitHub Desktop.
KRS open API scraping with Go for fun and profit
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module github.com/thinkofher/scrapper-boy | |
go 1.19 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bytes" | |
"encoding/json" | |
"log" | |
"net/http" | |
"os" | |
"sync" | |
) | |
type Payload struct { | |
Registry []string `json:"rejestr"` | |
Subject Subject `json:"podmiot"` | |
Pagination Pagination `json:"paginacja"` | |
} | |
// Subject is the location filter of a search request. The JSON keys are the
// Polish administrative levels expected by the API.
type Subject struct {
	// Voivodeship is sent as "wojewodztwo".
	Voivodeship string `json:"wojewodztwo"`
	// Powiat is sent as "powiat".
	Powiat string `json:"powiat"`
	// Gmina is sent as "gmina".
	Gmina string `json:"gmina"`
	// City is sent as "miejscowosc" (locality).
	City string `json:"miejscowosc"`
}
// Pagination describes which slice of the result set a search request asks for.
type Pagination struct {
	// ItemsPerPage is sent as "liczbaElementowNaStronie" (elements per page).
	ItemsPerPage int `json:"liczbaElementowNaStronie"`
	// Max is sent as "maksymalnaLiczbaWynikow" (maximum number of results).
	Max int `json:"maksymalnaLiczbaWynikow"`
	// PageNumber is sent as "numerStrony" (page number).
	PageNumber int `json:"numerStrony"`
}
type Response struct { | |
Length int `json:"liczbaPodmiotow"` | |
Subjects []SubjectFromResponse `json:"listaPodmiotow"` | |
} | |
// SubjectFromResponse is a single entity entry in a search Response.
type SubjectFromResponse struct {
	// OPP is decoded from "czyOPP".
	// NOTE(review): presumably the public-benefit-organisation flag; confirm
	// against the API documentation.
	OPP bool `json:"czyOPP"`
	// Fallen is decoded from "czyUpadlosc".
	// NOTE(review): presumably "is in bankruptcy"; confirm against the API
	// documentation.
	Fallen bool `json:"czyUpadlosc"`
	// City is decoded from "miejscowosc" (locality).
	City string `json:"miejscowosc"`
	// Name is decoded from "nazwa" (name).
	Name string `json:"nazwa"`
	// ID is decoded from "numer" (number). It is kept as a string.
	ID string `json:"numer"`
	// RegistryType is decoded from "typRejestru" (registry type).
	RegistryType string `json:"typRejestru"`
}
func processRequest(wg *sync.WaitGroup, mtx *sync.Mutex, i int, out chan<- Response) { | |
defer wg.Done() | |
log.Printf("processing request for page=%d", i) | |
data := Payload{ | |
Registry: []string{"P"}, | |
Subject: Subject{ | |
Voivodeship: "ŁÓDZKIE", | |
Powiat: "ŁÓDŹ", | |
Gmina: "ŁÓDŹ", | |
City: "ŁÓDŹ", | |
}, | |
Pagination: Pagination{ | |
ItemsPerPage: 100, | |
Max: 100, | |
PageNumber: i, | |
}, | |
} | |
payloadBytes, err := json.Marshal(data) | |
if err != nil { | |
log.Printf("json marshal error: %s", err) | |
return | |
} | |
body := bytes.NewReader(payloadBytes) | |
req, err := http.NewRequest("POST", "https://prs-openapi2-prs-prod.apps.ocp.prod.ms.gov.pl/api/wyszukiwarka/krs", body) | |
if err != nil { | |
log.Printf("new request error for page=%d: %s", i, err) | |
return | |
} | |
req.Header.Set("Accept", "application/json, text/plain, */*") | |
req.Header.Set("Accept-Language", "en-GB,en-US;q=0.9,en;q=0.8,pl;q=0.7") | |
req.Header.Set("Connection", "keep-alive") | |
req.Header.Set("Content-Type", "application/json") | |
req.Header.Set("Dnt", "1") | |
req.Header.Set("Origin", "https://wyszukiwarka-krs.ms.gov.pl") | |
req.Header.Set("Referer", "https://wyszukiwarka-krs.ms.gov.pl/") | |
req.Header.Set("Sec-Fetch-Dest", "empty") | |
req.Header.Set("Sec-Fetch-Mode", "cors") | |
req.Header.Set("Sec-Fetch-Site", "same-site") | |
req.Header.Set("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36") | |
req.Header.Set("Sec-Ch-Ua", "\"Chromium\";v=\"107\", \"Not=A?Brand\";v=\"24\"") | |
req.Header.Set("Sec-Ch-Ua-Mobile", "?0") | |
req.Header.Set("Sec-Ch-Ua-Platform", "\"macOS\"") | |
req.Header.Set("X-Api-Key", "TopSecretApiKey") | |
// mtx.Lock() | |
resp, err := http.DefaultClient.Do(req) | |
// mtx.Unlock() | |
if err != nil { | |
log.Printf("HTTP request error for page=%d: %s", i, err) | |
return | |
} | |
defer resp.Body.Close() | |
var subjects Response | |
if err := json.NewDecoder(resp.Body).Decode(&subjects); err != nil { | |
log.Printf("json decoding failure for page=%d: %s", i, err) | |
return | |
} | |
out <- subjects | |
} | |
func glueOutput(in <-chan Response, out chan<- []SubjectFromResponse) { | |
res := []SubjectFromResponse{} | |
for sr := range in { | |
for _, s := range sr.Subjects { | |
res = append(res, s) | |
} | |
} | |
out <- res | |
close(out) | |
} | |
func main() { | |
pages := 100 | |
mtx := sync.Mutex{} | |
wg := sync.WaitGroup{} | |
wg.Add(pages) | |
c := make(chan Response) | |
cs := make(chan []SubjectFromResponse) | |
go glueOutput(c, cs) | |
for i := 1; i <= pages; i++ { | |
go processRequest(&wg, &mtx, i, c) | |
} | |
wg.Wait() | |
close(c) | |
f, err := os.Create("out.json") | |
if err != nil { | |
return | |
} | |
defer f.Close() | |
out := <-cs | |
if err := json.NewEncoder(f).Encode(out); err != nil { | |
log.Printf("json encoding error: %s", err) | |
return | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment