Dmitiry Zub☀️ dimitryzub

## researchgate-profile-page.py
# https://serpapi.com/blog/scrape-researchgate-profile-page-in-python/#code-explanation

from parsel import Selector
from playwright.sync_api import sync_playwright
import json, re


def scrape_researchgate_profile(profile: str):
    with sync_playwright() as p:


## researchgate-institution-members.py
from parsel import Selector
from playwright.sync_api import sync_playwright
import re, json, time

def scrape_institution_members(institution: str):
    with sync_playwright() as p:

        institution_memebers = []
        page_num = 1


## google-scholar-papers-from-certain-conference.py
# blog: https://serpapi.com/blog/scrape-google-scholar-papers-within-a-particular-conference-in-python/

from parsel import Selector
import requests, json, os


def check_sources(source: list or str):
    if isinstance(source, str):
        return source                                             # NIPS
    elif isinstance(source, list):

## google-scholar-papers-from-certain-website-serpapi.py
# pip install google-search-results
import os, json
from serpapi import GoogleSearch
from urllib.parse import urlsplit, parse_qsl

def serpapi_scrape(query: str, website: str):
    params = {
        # https://docs.python.org/3/library/os.html#os.getenv
        "api_key": os.getenv("API_KEY"), # your serpapi API key
        "engine": "google_scholar",      # search engine

## google-scholar-papers-from-certain-website.py
from parsel import Selector
import requests, json, os


def check_websites(website: list or str):
    """Turn a website argument into the site portion of a search query.

    A list is rendered as ``site:<entry>`` terms joined with ``" OR "``;
    a string is handed back unchanged. Any other type falls through and
    the function returns ``None``.

    Note: the annotation ``list or str`` evaluates to plain ``list`` at
    definition time; it is kept as-is so the signature stays unchanged.
    """
    if isinstance(website, list):
        # e.g. site:cabdirect.org OR site:cab.com
        return " OR ".join(f'site:{entry}' for entry in website)

    if isinstance(website, str):
        # e.g. cabdirect.org
        return website

    return None

## scrape_google_scholar_profiles_from_certain_university.py
from parsel import Selector
import requests, re, json

def scrape_all_profiles_from_university(university_name: str):

    # https://docs.python-requests.org/en/master/user/quickstart/#passing-parameters-in-urls
    params = {
        "view_op": "search_authors",  # author results
        "mauthors": university_name,  # search query
        "hl": "en",                   # language

## scrape_google_finance_ticker_python.py
import nasdaqdatalink
import requests, json, re
from parsel import Selector
from itertools import zip_longest


def scrape_google_finance(ticker: str):
    params = {
        "hl": "en" # language
        }

## scrape_naver_video_results_in_python.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                dimitryzub
                / scrape_naver_video_results_in_python.md
            
            
              Last active
              April 4, 2022 11:27
            
              
                Scrape Naver Video Results with Python and the SerpApi web-scraping library.
              
          
    What will be scraped


Prerequisites

Basic knowledge of scraping with CSS selectors
If you haven't scraped with CSS selectors, there's a dedicated blog post of mine about how to use CSS selectors when web-scraping that covers what they are, their pros and cons, and why they matter from a web-scraping perspective.

  
## brave_search_organic_results.py
from bs4 import BeautifulSoup
import requests, lxml, json

# HTTP header mapping carrying a desktop-browser User-Agent string.
# The long literal is split into adjacent pieces; the resulting value is
# identical to the single-line form.
headers = {
    'User-agent': (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
    ),
}

# Search parameters: the query text and the result source.
# Presumably sent as the URL query string of the search request - verify
# against the request call further down the script.
params = dict(q='dune film', source='web')

## duckduckgo_scrape_knowledge_graph.py
from selenium import webdriver

# Locator constants for the generic find_element() API used below.
from selenium.webdriver.common.by import By

# Start a Chrome session using the chromedriver binary at the given path.
# NOTE(review): `executable_path` is deprecated in Selenium 4 and no longer
# accepted by recent 4.x releases (a Service object is used instead) -
# confirm against the Selenium version this script is pinned to.
driver = webdriver.Chrome(executable_path='PATH/TO/chromedriver.exe')
# &iax=about - expanded knowledge graph
driver.get('https://duckduckgo.com/?q=elon musk&kl=us-en&ia=web&iax=about')


# Read the visible text of the first element matching the title-link selector.
# find_element_by_css_selector() was removed in Selenium 4.3;
# find_element(By.CSS_SELECTOR, ...) is the equivalent call and is also
# available in Selenium 3.
title = driver.find_element(By.CSS_SELECTOR, '.module__title__link').text

try:
	# https://serpapi.com/blog/scrape-researchgate-profile-page-in-python/#code-explanation

	from parsel import Selector
	from playwright.sync_api import sync_playwright
	import json, re


	def scrape_researchgate_profile(profile: str):
	with sync_playwright() as p:
	from parsel import Selector
	from playwright.sync_api import sync_playwright
	import re, json, time

	def scrape_institution_members(institution: str):
	with sync_playwright() as p:

	institution_memebers = []
	page_num = 1
	# blog: https://serpapi.com/blog/scrape-google-scholar-papers-within-a-particular-conference-in-python/

	from parsel import Selector
	import requests, json, os


	def check_sources(source: list or str):
	if isinstance(source, str):
	return source # NIPS
	elif isinstance(source, list):
	# pip install google-search-results
	import os, json
	from serpapi import GoogleSearch
	from urllib.parse import urlsplit, parse_qsl

	def serpapi_scrape(query: str, website: str):
	params = {
	# https://docs.python.org/3/library/os.html#os.getenv
	"api_key": os.getenv("API_KEY"), # your serpapi API key
	"engine": "google_scholar", # search engine
	from parsel import Selector
	import requests, json, os


	def check_websites(website: list or str):
	if isinstance(website, str):
	return website # cabdirect.org
	elif isinstance(website, list):
	return " OR ".join([f'site:{site}' for site in website]) # site:cabdirect.org OR site:cab.com
	from parsel import Selector
	import requests, re, json

	def scrape_all_profiles_from_university(university_name: str):

	# https://docs.python-requests.org/en/master/user/quickstart/#passing-parameters-in-urls
	params = {
	"view_op": "search_authors", # author results
	"mauthors": university_name, # search query
	"hl": "en", # language
	import nasdaqdatalink
	import requests, json, re
	from parsel import Selector
	from itertools import zip_longest


	def scrape_google_finance(ticker: str):
	params = {
	"hl": "en" # language
	}
	from bs4 import BeautifulSoup
	import requests, lxml, json

	headers = {
	'User-agent':
	"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
	}

	params = {'q': 'dune film', 'source': 'web'}
	from selenium import webdriver

	driver = webdriver.Chrome(executable_path='PATH/TO/chromedriver.exe')
	# &iax=about - expanded knowledge graph
	driver.get('https://duckduckgo.com/?q=elon musk&kl=us-en&ia=web&iax=about')


	title = driver.find_element_by_css_selector('.module__title__link').text

	try: