Skip to content

Instantly share code, notes, and snippets.

@thinkofher
Created November 26, 2022 23:32
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thinkofher/5d2b5e6ba5cd38cd698512a59c76150f to your computer and use it in GitHub Desktop.
Save thinkofher/5d2b5e6ba5cd38cd698512a59c76150f to your computer and use it in GitHub Desktop.
KRS Open API scraping with Go for fun and profit
module github.com/thinkofher/scrapper-boy
go 1.19
package main
import (
"bytes"
"encoding/json"
"log"
"net/http"
"os"
"sync"
)
// Payload is the JSON body of a search request: which registries to query,
// the location filter, and the page of results wanted.
type Payload struct {
	// Registry holds the registry codes to search; encoded as "rejestr".
	Registry []string `json:"rejestr"`
	// Subject is the location filter; encoded as "podmiot".
	Subject Subject `json:"podmiot"`
	// Pagination selects the result page; encoded as "paginacja".
	Pagination Pagination `json:"paginacja"`
}
// Subject is the location filter of a search request. Each field is sent
// under its Polish JSON key.
type Subject struct {
	// Voivodeship is encoded as "wojewodztwo".
	Voivodeship string `json:"wojewodztwo"`
	// Powiat is encoded as "powiat".
	Powiat string `json:"powiat"`
	// Gmina is encoded as "gmina".
	Gmina string `json:"gmina"`
	// City is encoded as "miejscowosc".
	City string `json:"miejscowosc"`
}
// Pagination selects which slice of the result set a search request asks for.
type Pagination struct {
	// ItemsPerPage is encoded as "liczbaElementowNaStronie".
	ItemsPerPage int `json:"liczbaElementowNaStronie"`
	// Max is encoded as "maksymalnaLiczbaWynikow" (maximum number of results).
	Max int `json:"maksymalnaLiczbaWynikow"`
	// PageNumber is encoded as "numerStrony".
	PageNumber int `json:"numerStrony"`
}
// Response is the decoded JSON body of one search result page.
type Response struct {
	// Length is decoded from "liczbaPodmiotow" (number of subjects).
	Length int `json:"liczbaPodmiotow"`
	// Subjects is decoded from "listaPodmiotow" (list of subjects).
	Subjects []SubjectFromResponse `json:"listaPodmiotow"`
}
// SubjectFromResponse is a single entry of Response.Subjects. The same
// struct is re-encoded when the results are written out, so the field order
// here is also the key order of the output.
type SubjectFromResponse struct {
	// OPP mirrors the "czyOPP" flag.
	// NOTE(review): presumably public-benefit organisation status; confirm
	// against the API documentation.
	OPP bool `json:"czyOPP"`
	// Fallen mirrors the "czyUpadlosc" flag (Polish "upadłość": bankruptcy).
	Fallen bool `json:"czyUpadlosc"`
	// City is decoded from "miejscowosc".
	City string `json:"miejscowosc"`
	// Name is decoded from "nazwa".
	Name string `json:"nazwa"`
	// ID is decoded from "numer"; it is kept as a string.
	ID string `json:"numer"`
	// RegistryType is decoded from "typRejestru".
	RegistryType string `json:"typRejestru"`
}
// setSearchHeaders applies the fixed, browser-like header set (including the
// API key) that accompanies every search request.
func setSearchHeaders(h http.Header) {
	headers := [][2]string{
		{"Accept", "application/json, text/plain, */*"},
		{"Accept-Language", "en-GB,en-US;q=0.9,en;q=0.8,pl;q=0.7"},
		{"Connection", "keep-alive"},
		{"Content-Type", "application/json"},
		{"Dnt", "1"},
		{"Origin", "https://wyszukiwarka-krs.ms.gov.pl"},
		{"Referer", "https://wyszukiwarka-krs.ms.gov.pl/"},
		{"Sec-Fetch-Dest", "empty"},
		{"Sec-Fetch-Mode", "cors"},
		{"Sec-Fetch-Site", "same-site"},
		{"User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"},
		{"Sec-Ch-Ua", "\"Chromium\";v=\"107\", \"Not=A?Brand\";v=\"24\""},
		{"Sec-Ch-Ua-Mobile", "?0"},
		{"Sec-Ch-Ua-Platform", "\"macOS\""},
		{"X-Api-Key", "TopSecretApiKey"},
	}
	for _, kv := range headers {
		h.Set(kv[0], kv[1])
	}
}

// processRequest fetches one page of search results and sends the decoded
// Response on out.
//
// It POSTs a fixed query (registry "P"; voivodeship ŁÓDZKIE; powiat, gmina
// and city ŁÓDŹ; 100 items per page) for page number i using
// http.DefaultClient. Any failure is logged and the page is skipped, with
// nothing sent on out. Failures are: marshalling the payload, building the
// request, a transport error, a status other than 200 OK, or a body that
// does not decode.
//
// wg.Done is called exactly once, when the function returns. mtx is not
// used; the parameter is kept so existing callers keep compiling.
//
// NOTE: http.DefaultClient has no timeout, so a stalled connection blocks
// this call (and therefore the caller's wg.Wait) indefinitely.
func processRequest(wg *sync.WaitGroup, mtx *sync.Mutex, i int, out chan<- Response) {
	defer wg.Done()
	log.Printf("processing request for page=%d", i)

	data := Payload{
		Registry: []string{"P"},
		Subject: Subject{
			Voivodeship: "ŁÓDZKIE",
			Powiat:      "ŁÓDŹ",
			Gmina:       "ŁÓDŹ",
			City:        "ŁÓDŹ",
		},
		Pagination: Pagination{
			ItemsPerPage: 100,
			Max:          100,
			PageNumber:   i,
		},
	}
	payloadBytes, err := json.Marshal(data)
	if err != nil {
		log.Printf("json marshal error: %s", err)
		return
	}

	req, err := http.NewRequest(http.MethodPost, "https://prs-openapi2-prs-prod.apps.ocp.prod.ms.gov.pl/api/wyszukiwarka/krs", bytes.NewReader(payloadBytes))
	if err != nil {
		log.Printf("new request error for page=%d: %s", i, err)
		return
	}
	setSearchHeaders(req.Header)

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		log.Printf("HTTP request error for page=%d: %s", i, err)
		return
	}
	defer resp.Body.Close()

	// An error response must not be decoded as a result page: its body may
	// be valid JSON that decodes into an empty Response, or fail with a
	// misleading decoding error.
	if resp.StatusCode != http.StatusOK {
		log.Printf("unexpected HTTP status for page=%d: %s", i, resp.Status)
		return
	}

	var subjects Response
	if err := json.NewDecoder(resp.Body).Decode(&subjects); err != nil {
		log.Printf("json decoding failure for page=%d: %s", i, err)
		return
	}
	out <- subjects
}
// glueOutput drains in, concatenating the Subjects of every Response in
// arrival order, then delivers the combined slice on out and closes out.
//
// It blocks until in is closed. The result is always a non-nil slice, so a
// run with no subjects still encodes to JSON as [] rather than null.
func glueOutput(in <-chan Response, out chan<- []SubjectFromResponse) {
	merged := []SubjectFromResponse{}
	for batch := range in {
		merged = append(merged, batch.Subjects...)
	}
	out <- merged
	close(out)
}
// main requests result pages 1..100 concurrently, merges every returned
// subject into one list and writes it to out.json as a single JSON array.
//
// Subjects appear in the order their responses arrive, so the order is not
// stable between runs. Failures creating, writing or closing the output
// file are logged.
func main() {
	pages := 100

	var mtx sync.Mutex
	var wg sync.WaitGroup
	wg.Add(pages)

	c := make(chan Response)
	cs := make(chan []SubjectFromResponse)

	// The collector must be running before any worker sends, because c is
	// unbuffered.
	go glueOutput(c, cs)
	for i := 1; i <= pages; i++ {
		go processRequest(&wg, &mtx, i, c)
	}

	// Close c only after every worker has finished, so the collector sees
	// end-of-input and publishes the merged result on cs.
	wg.Wait()
	close(c)

	f, err := os.Create("out.json")
	if err != nil {
		log.Printf("creating out.json: %s", err)
		return
	}
	// Close can report a write error on a file we just wrote to, so it is
	// checked rather than ignored.
	defer func() {
		if err := f.Close(); err != nil {
			log.Printf("closing out.json: %s", err)
		}
	}()

	out := <-cs
	if err := json.NewEncoder(f).Encode(out); err != nil {
		log.Printf("json encoding error: %s", err)
		return
	}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment