Last active
February 4, 2018 17:22
-
-
Save IngoKl/adae170bf7d813c01f40d91f8cdc0c69 to your computer and use it in GitHub Desktop.
A simple Python script that downloads/retrieves all titles based on a given Google Scholar query.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import re | |
from bs4 import BeautifulSoup | |
# Upper bound on results to collect; Scholar serves 10 results per page.
max_results = 17000
# Seed query URL; the start= parameter is advanced page by page by new_search_url().
search_url = 'https://scholar.google.de/scholar?start=0&q=(%22social+media%22+AND+%22teaching%22)&hl=de&as_sdt=0,5&as_ylo=2017'
# requests-style proxy mapping — assumes a SOCKS5 proxy is listening on 127.0.0.1:8080; TODO confirm.
proxies = {'https': 'socks5://127.0.0.1:8080', 'http': 'socks5://127.0.0.1:8080'}
def test_proxy(proxies):
    """Print the external IP as reported by ipify, without and with the proxy.

    Lets the operator visually confirm that traffic is actually being
    routed through the configured SOCKS proxy before scraping starts.
    """
    direct = requests.get('https://api.ipify.org/?format=text')
    print ('No Proxy: ', direct, direct.content)
    proxied = requests.get('https://api.ipify.org/?format=text', proxies=proxies)
    print ('Proxy: ', proxied, proxied.content)
def new_search_url(search_url, step=10):
    """Return *search_url* with its ``start=`` offset advanced by *step*.

    Parameters:
        search_url: a results URL containing a ``start=<digits>`` query
            parameter (Google Scholar pagination offset).
        step: how far to advance the offset; defaults to 10, one Scholar
            results page.

    Returns:
        The URL with ``start=`` rewritten to the incremented offset.

    Raises:
        ValueError: if the URL contains no ``start=`` parameter (the
            original code crashed with an opaque TypeError here).
    """
    match = re.search(r'start=([0-9]*)', search_url)
    if match is None:
        raise ValueError(f'no start= parameter in URL: {search_url!r}')
    current = int(match[1])
    return re.sub(r'start=[0-9]*', f'start={current + step}', search_url)
def get_titles(search_url, proxies=False):
    """Fetch one Scholar results page and return its result titles.

    Parameters:
        search_url: URL of the results page to fetch.
        proxies: requests-style proxy mapping, or False to connect directly.

    Returns:
        List of stripped title strings, one per ``h3.gs_rt`` element.
    """
    if proxies:
        print ('Using a Proxy')
        response = requests.get(search_url, proxies=proxies)
    else:
        response = requests.get(search_url)

    page = BeautifulSoup(response.content, 'html.parser')
    # Result titles live in <h3 class="gs_rt"> on Scholar result pages.
    return [heading.text.strip() for heading in page.findAll('h3', {'class': 'gs_rt'})]
def write_title(title):
    """Append a single title to ``search_results.txt``, one title per line.

    Opening the file on every call is deliberately non-optimized; it allows
    'hot' extraction of results at any stage of a long scraping run.

    Parameters:
        title: the title string to persist (newline is appended here).
    """
    # `with` guarantees the handle is closed even if the write raises,
    # unlike the original open()/close() pair which leaked on error.
    with open('search_results.txt', 'a', encoding='utf8') as f:
        f.write(title + '\n')
# Driver: confirm the proxy works, then walk the result pages one by one,
# appending every title found to the output file as we go.
test_proxy(proxies)

pages = round(max_results / 10)
for page in range(pages):
    # The seed URL already points at the first page; advance afterwards.
    if page:
        search_url = new_search_url(search_url)

    #titles = get_titles(search_url, proxies)
    titles = get_titles(search_url)

    for title in titles:
        write_title(title)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment