Skip to content

Instantly share code, notes, and snippets.

@noveoko
Last active August 31, 2022 19:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save noveoko/5a1d4ed6325b1adf23faf035ecdcdcee to your computer and use it in GitHub Desktop.
Save noveoko/5a1d4ed6325b1adf23faf035ecdcdcee to your computer and use it in GitHub Desktop.
Fetch Data from Metryki
import shutil
import time
from pathlib import Path

import requests
# Surnames to look up; one search request is issued per entry.
list_of_last_names = [
    'Nowak',
    'Smith',
]
def get_all_hits(last_name, *, timeout=30):
    """Fetch the Geneszukacz search-results page for one surname.

    Issues a GET request to ``geneszukacz.genealodzy.pl/index.php`` with
    ``last_name`` as the ``search_lastname`` query parameter and all other
    search filters left empty.

    Args:
        last_name: Surname to search for. It is handed to requests via
            ``params`` so that characters such as ``&``, ``#`` or spaces are
            percent-encoded instead of corrupting the query string.
        timeout: Seconds to wait for the server before requests gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The raw response body as ``bytes`` when the server answers with
        HTTP 200, otherwise ``None``.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            ``timeout`` elapses.
    """
    url = "https://geneszukacz.genealodzy.pl/index.php"
    # Same keys, order and (empty) values as the original hand-built query.
    params = {
        'search_lastname': last_name,
        'from_date': '',
        'to_date': '',
        'rpp1': '',
        'bdm': '',
        'url1': '',
        'w': '',
        'lang': 'pol',
        'op': 'se',
    }
    # NOTE(review): these look like headers captured from a desktop Chrome
    # session (including a fixed referer for another surname) -- confirm
    # whether the site actually requires all of them.
    headers = {
        'authority': 'geneszukacz.genealodzy.pl',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
        'accept-language': 'en;q=0.8',
        'cache-control': 'no-cache',
        'pragma': 'no-cache',
        'referer': 'https://geneszukacz.genealodzy.pl/index.php?search_lastname=Kraszewski&from_date=&to_date=&rpp1=&bdm=&url1=&w=&lang=pol&op=se',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'sec-gpc': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36',
    }
    response = requests.get(url, params=params, headers=headers, timeout=timeout)
    if response.status_code == 200:
        return response.content
    # Any non-200 answer is reported as "no data"; the caller decides what to do.
    return None
# Directory that receives one "<surname>.html" file per searched name.
DESTINATION = '/content/people'

# Make sure the output directory exists; open() below would otherwise fail
# with FileNotFoundError on a fresh machine.
destination_dir = Path(DESTINATION)
destination_dir.mkdir(parents=True, exist_ok=True)

for name in list_of_last_names:
    # Wait 4 s before every request (presumably to avoid hammering the server).
    time.sleep(4)
    print(f'Fetching data for: {name}')
    result = get_all_hits(name)
    if not result:
        # A missing or empty response aborts the whole run.
        raise ValueError(f'No data collected for: {name}')
    print(f'{name} captured! Size: {len(result)}')
    # Save the raw response body to an HTML file named after the surname.
    with open(destination_dir / f'{name.lower()}.html', 'wb') as f:
        f.write(result)

# Bundle the results as "<DESTINATION>.zip". Archiving relative to the
# filesystem root keeps the same entry layout ("content/people/...") that
# `zip -r /content/people.zip /content/people/` produced, without relying on
# IPython shell magic or an external `zip` binary.
shutil.make_archive(
    str(destination_dir),
    'zip',
    root_dir=destination_dir.anchor,
    base_dir=str(destination_dir.relative_to(destination_dir.anchor)),
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment