Skip to content

Instantly share code, notes, and snippets.

@IngoKl
Last active February 4, 2018 17:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save IngoKl/adae170bf7d813c01f40d91f8cdc0c69 to your computer and use it in GitHub Desktop.
A simple Python script that downloads/retrieves all titles based on a given Google Scholar query.
import requests
import re
from bs4 import BeautifulSoup
# Upper bound on results to pull; Scholar serves 10 results per page.
max_results = 17000

# Seed query URL (page 0); later pages are derived via new_search_url().
search_url = ('https://scholar.google.de/scholar?start=0&q=(%22social+media%22'
              '+AND+%22teaching%22)&hl=de&as_sdt=0,5&as_ylo=2017')

# SOCKS5 proxy mapping, applied to both schemes when proxying is enabled.
proxies = {
    'https': 'socks5://127.0.0.1:8080',
    'http': 'socks5://127.0.0.1:8080',
}
def test_proxy(proxies):
    """Print the apparent public IP with and without the SOCKS proxy.

    Two differing addresses in the output confirm the proxy is in effect.

    :param proxies: requests-style proxy mapping (scheme -> proxy URL).
    """
    direct = requests.get('https://api.ipify.org/?format=text')
    print('No Proxy: ', direct, direct.content)
    proxied = requests.get('https://api.ipify.org/?format=text', proxies=proxies)
    print('Proxy: ', proxied, proxied.content)
def new_search_url(search_url, step=10):
    """Return *search_url* with its ``start`` parameter advanced by *step*.

    Google Scholar paginates 10 results per page, so the default ``step``
    of 10 moves the query to the next results page (backward compatible
    with the original hard-coded increment).

    :param search_url: full Scholar query URL containing ``start=<n>``.
    :param step: how far to advance the ``start`` offset.
    :returns: the URL with ``start`` replaced by ``start=<n + step>``.
    :raises ValueError: if the URL has no ``start=`` parameter (the
        original code crashed with an opaque ``TypeError`` here).
    """
    match = re.search(r'start=([0-9]*)', search_url)
    if match is None:
        raise ValueError('search_url has no start= parameter')
    current = int(match[1])
    return re.sub(r'start=[0-9]*', f'start={current + step}', search_url)
def get_titles(search_url, proxies=None):
    """Fetch one Google Scholar results page and return its result titles.

    :param search_url: full Scholar query URL for one results page.
    :param proxies: optional requests-style proxy mapping; any falsy value
        (``None`` now, ``False`` in the original signature — both behave
        identically under the truthiness test below) means a direct
        connection.
    :returns: list of stripped title strings, one per ``h3.gs_rt`` heading.
    """
    if proxies:
        print('Using a Proxy')
        result = requests.get(search_url, proxies=proxies)
    else:
        result = requests.get(search_url)
    soup = BeautifulSoup(result.content, 'html.parser')
    # find_all is the modern name for the deprecated findAll alias.
    return [h3.text.strip() for h3 in soup.find_all('h3', {'class': 'gs_rt'})]
def write_title(title, path='search_results.txt'):
    """Append a single title (plus newline) to the results file.

    Opening the file on every call is deliberately non-optimized: it allows
    'hot' extraction of partial results while the scrape is still running.

    :param title: one result title.
    :param path: output file; defaults to the original hard-coded name.
    """
    # Context manager guarantees the handle is closed even if write() raises
    # (the original leaked the handle on error).
    with open(path, 'a', encoding='utf8') as f:
        f.write(title + '\n')
# --- Main scraping loop --------------------------------------------------
# Confirm connectivity (and whether the proxy changes the public IP), then
# walk Scholar's pagination, appending each page's titles to the results
# file as they arrive.
test_proxy(proxies)

for page in range(round(max_results / 10)):
    # The seed URL already points at the first page; advance from page 1 on.
    if page != 0:
        search_url = new_search_url(search_url)

    # NOTE(review): the proxy is tested above but the proxied fetch below
    # was left disabled in the original.
    #titles = get_titles(search_url, proxies)
    titles = get_titles(search_url)
    for title in titles:
        write_title(title)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment