# Gist harej/f325f881289e221bde32b7215503abff by @harej (created May 26, 2016 09:30).
# A script that scrapes tools.wmflabs.org/sourcemd
import requests
import time
import csv
from bs4 import BeautifulSoup
# Endpoint of the sourcemd tool; the query string is built by `requests`.
_SOURCEMD_URL = "https://tools.wmflabs.org/sourcemd/"


def _fetch_source_page(source_id, retry_delay=300, timeout=60):
    """Fetch the sourcemd "Check source" page for one identifier.

    The identifier is passed through ``params`` so that characters such as
    ``&``, ``#`` or ``+`` are URL-encoded instead of corrupting the query.

    Args:
        source_id: Identifier to submit as the ``id`` query parameter.
        retry_delay: Seconds to wait before the single retry after a 503.
        timeout: Per-request timeout in seconds.

    Returns:
        The ``requests`` response object, guaranteed to have status 200.

    Raises:
        Exception: If the final response status is not 200.
    """
    query = {"id": source_id, "doit": "Check source"}
    response = requests.get(_SOURCEMD_URL, params=query, timeout=timeout)
    if response.status_code == 503:
        # The service is temporarily unavailable: wait, then try once more.
        time.sleep(retry_delay)
        response = requests.get(_SOURCEMD_URL, params=query, timeout=timeout)
    if response.status_code != 200:
        raise Exception(
            response.url + " returns an error of: " + str(response.status_code)
        )
    return response


def _extract_textarea(html):
    """Return the first child of the page's only <textarea>, or None.

    None is returned when the page does not contain exactly one textarea,
    or when that textarea is empty.
    """
    soup = BeautifulSoup(html, "html.parser")
    textareas = soup.find_all("textarea")
    if len(textareas) != 1 or not textareas[0].contents:
        return None
    return textareas[0].contents[0]


def main(sourcefile):
    """Look up every identifier in a CSV file on sourcemd and print the result.

    The first column of each row is submitted to the sourcemd tool; when the
    returned page holds exactly one non-empty textarea, its content is
    printed. Blank rows and rows whose first column is empty are skipped.

    Args:
        sourcefile: Path to a CSV file whose first column holds identifiers.

    Raises:
        Exception: If a lookup returns a non-200 status (after one retry
            when the status was 503).
    """
    # newline="" is what the csv module expects for correct newline handling.
    with open(sourcefile, newline="") as f:
        for row in csv.reader(f):
            # csv.reader yields [] for blank lines; avoid IndexError on row[0].
            if not row or not row[0].strip():
                continue
            response = _fetch_source_page(row[0].strip())
            text = _extract_textarea(response.text)
            if text is not None:
                print(text)
if __name__ == "__main__":
    # Run the scrape only when executed as a script, not when imported.
    main("doi.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment