Skip to content

Instantly share code, notes, and snippets.

View dimitryzub's full-sized avatar
🇺🇦
Grateful

Dmitiry Zub☀️ dimitryzub

🇺🇦
Grateful
View GitHub Profile
@dimitryzub
dimitryzub / researchgate-profile-page.py
Created May 23, 2022 12:28
Web Scraping ResearchGate Profile Page in Python
# https://serpapi.com/blog/scrape-researchgate-profile-page-in-python/#code-explanation
from parsel import Selector
from playwright.sync_api import sync_playwright
import json, re
def scrape_researchgate_profile(profile: str):
with sync_playwright() as p:
@dimitryzub
dimitryzub / researchgate-institution-members.py
Last active May 6, 2022 10:48
Script to scrape all institution members from ResearchGate in Python
from parsel import Selector
from playwright.sync_api import sync_playwright
import re, json, time
def scrape_institution_members(institution: str):
with sync_playwright() as p:
institution_memebers = []
page_num = 1
# blog: https://serpapi.com/blog/scrape-google-scholar-papers-within-a-particular-conference-in-python/
from parsel import Selector
import requests, json, os
def check_sources(source: list or str):
if isinstance(source, str):
return source # NIPS
elif isinstance(source, list):
# pip install google-search-results
import os, json
from serpapi import GoogleSearch
from urllib.parse import urlsplit, parse_qsl
def serpapi_scrape(query: str, website: str):
params = {
# https://docs.python.org/3/library/os.html#os.getenv
"api_key": os.getenv("API_KEY"), # your serpapi API key
"engine": "google_scholar", # search engine
from parsel import Selector
import requests, json, os
def check_websites(website: "list[str] | str") -> "str | None":
    """Build the website part of a search query.

    The original annotation ``list or str`` evaluated to plain ``list`` at
    definition time; a string annotation is used instead so both accepted
    types are documented without requiring newer syntax or extra imports.

    :param website: a single website as a string, or a list of websites.
    :return: the string unchanged when a string is given; a
        ``site:<name>`` expression joined with ``" OR "`` when a list is
        given; ``None`` for any other input type.
    """
    if isinstance(website, str):
        return website  # cabdirect.org
    if isinstance(website, list):
        # site:cabdirect.org OR site:cab.com
        return " OR ".join(f'site:{site}' for site in website)
    # Unsupported input type: keep the historical fall-through result explicit.
    return None
@dimitryzub
dimitryzub / scrape_google_scholar_profiles_from_certain_university.py
Last active April 15, 2022 13:19
Scrapes all profiles from Google Scholar Profiles using pagination.
from parsel import Selector
import requests, re, json
def scrape_all_profiles_from_university(university_name: str):
# https://docs.python-requests.org/en/master/user/quickstart/#passing-parameters-in-urls
params = {
"view_op": "search_authors", # author results
"mauthors": university_name, # search query
"hl": "en", # language
@dimitryzub
dimitryzub / scrape_google_finance_ticker_python.py
Last active October 24, 2023 15:14
A script that scrapes Google Finance Ticker in Python - google.com/finance/quote/
import nasdaqdatalink
import requests, json, re
from parsel import Selector
from itertools import zip_longest
def scrape_google_finance(ticker: str):
params = {
"hl": "en" # language
}
@dimitryzub
dimitryzub / scrape_naver_video_results_in_python.md
Last active April 4, 2022 11:27
Scrape Naver Video Results in Python and SerpApi web-scraping library.

What will be scraped

image

Prerequisites

Basic knowledge of scraping with CSS selectors

If you haven't scraped with CSS selectors, there's a dedicated blog post of mine about how to use CSS selectors when web scraping. It covers what they are, their pros and cons, and why they matter from a web-scraping perspective.

@dimitryzub
dimitryzub / brave_search_organic_results.py
Created October 26, 2021 08:26
Scrape Brave Search Organic Results using Python
from bs4 import BeautifulSoup
import requests, lxml, json
headers = {
'User-agent':
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
params = {'q': 'dune film', 'source': 'web'}
from selenium import webdriver
driver = webdriver.Chrome(executable_path='PATH/TO/chromedriver.exe')
# &iax=about - expanded knowledge graph
driver.get('https://duckduckgo.com/?q=elon musk&kl=us-en&ia=web&iax=about')
title = driver.find_element_by_css_selector('.module__title__link').text
try: