James Hare (harej)

harej / niosh_scraper.py
Last active September 25, 2015 15:21
A script to scrape the Pocket Guide to Chemical Hazards on NIOSH's website
# public domain
from bs4 import BeautifulSoup
import requests

def main():
    manifest = {}
    for id in range(1, 687):  # starting with PGCH #1 and going to #686, the last one
        if id == 553:  # this one is irregular and should be skipped
            continue
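The preview cuts off before the fetch-and-parse step. A minimal sketch of how each PGCH page might be retrieved and parsed; the zero-padded npgdNNNN.html URL pattern on cdc.gov and the title-only parsing are assumptions, not the gist's actual code.

import requests
from bs4 import BeautifulSoup

def scrape_entry(pgch_id):
    # Assumption: PGCH entries live at zero-padded npgdNNNN.html URLs
    url = "https://www.cdc.gov/niosh/npg/npgd{0:04d}.html".format(pgch_id)
    r = requests.get(url)
    if r.status_code != 200:
        return None
    soup = BeautifulSoup(r.text, "html.parser")
    # Keep only the page title as a stand-in for the chemical name
    title = soup.find("title")
    return {"id": pgch_id, "title": title.get_text(strip=True) if title else None}

In the loop above, the result would go into the manifest, e.g. manifest[id] = scrape_entry(id).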
harej / npg_gap_analysis.py
Last active June 9, 2016 21:20
Generates a list of items and properties used on NPG-related Wikidata entries and assesses the existence of labels in other languages
# Step 1: Get list of any Wikidata item with NPG ID and anything that is a subclass of chemical hazard
# Step 2: Iterate through each item for invoked items and properties
# (for claim in claims; for subclaim in claim: 'Q' + str(subclaim['mainsnak']['datavalue']['value']['numeric-id'])
#  and subclaim['mainsnak']['property'], where subclaim['mainsnak']['datatype'] == 'wikibase-item')
# Step 3: De-duplicate to generate exhaustive list of each item/property of interest to NIOSH
# Step 4: Check labels: en, es, zh, fr, de
# Step 5: Prepare HTML table that lists each item/property of interest, highlighting cells where values are missing
# Step 6: Take percentages of coverage in each language; save to a timestamped log
import requests
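Only the first import of the gist is shown. Steps 2 through 4 map fairly directly onto the Wikidata API; below is a rough sketch of the claim-walking and label-checking parts, assuming wbgetentities is used for the label lookups (the helper names are illustrative, not from the gist).

import requests

LANGUAGES = ["en", "es", "zh", "fr", "de"]
API = "https://www.wikidata.org/w/api.php"

def items_and_properties(claims):
    # Steps 2-3: walk every claim, collect referenced items and properties, de-duplicated
    found = set()
    for prop, statements in claims.items():
        for statement in statements:
            snak = statement["mainsnak"]
            if snak.get("datatype") == "wikibase-item" and "datavalue" in snak:
                found.add("Q" + str(snak["datavalue"]["value"]["numeric-id"]))
                found.add(snak["property"])
    return found

def label_coverage(entity_ids):
    # Step 4: check which of the five languages have a label for each entity
    coverage = {}
    ids = sorted(entity_ids)
    for i in range(0, len(ids), 50):  # wbgetentities accepts up to 50 IDs per request
        batch = ids[i:i + 50]
        r = requests.get(API, params={
            "action": "wbgetentities", "ids": "|".join(batch),
            "props": "labels", "languages": "|".join(LANGUAGES), "format": "json"})
        for entity_id, entity in r.json().get("entities", {}).items():
            labels = entity.get("labels", {})
            coverage[entity_id] = {lang: lang in labels for lang in LANGUAGES}
    return coverage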
harej / citoid.py
Created April 22, 2016 19:31
Creates a CSV based on Citoid output
import requests
import csv
from collections import defaultdict

def get_citation(inputstring):
    # Ask the Citoid service for citation metadata in its mediawiki output format
    r = requests.get("https://citoid.wikimedia.org/api?format=mediawiki&search=" + inputstring)
    return r.json()[0]
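The CSV step itself is not shown in the preview. A sketch of how the Citoid responses might be flattened into rows, one identifier per row; the field list and output file name are assumptions.

import csv

def write_citations(identifiers, outfile="citations.csv"):
    # Hypothetical CSV step: one row per identifier plus a few common Citoid fields
    fields = ["itemType", "title", "date", "publicationTitle", "DOI", "PMID"]
    with open(outfile, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=["identifier"] + fields)
        writer.writeheader()
        for identifier in identifiers:
            citation = get_citation(identifier)
            row = {"identifier": identifier}
            row.update({key: citation.get(key, "") for key in fields})
            writer.writerow(row)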
harej / sourcemetadata_scraper.py
Created May 26, 2016 09:30
A script that scrapes tools.wmflabs.org/sourcemd
import requests
import time
import csv
from bs4 import BeautifulSoup

def main(sourcefile):
    url_template = "https://tools.wmflabs.org/sourcemd/?id={0}&doit=Check+source"
    with open(sourcefile) as f:
        csvdump = csv.reader(f)
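The preview stops before the scraping loop. A sketch of how each row might be fetched and parsed, assuming the identifier (DOI/PMID) sits in the first CSV column and the tool's generated statements appear in a textarea on the result page; both are assumptions about the unseen part of the gist.

import csv
import time
import requests
from bs4 import BeautifulSoup

URL_TEMPLATE = "https://tools.wmflabs.org/sourcemd/?id={0}&doit=Check+source"

def scrape(sourcefile):
    results = {}
    with open(sourcefile) as f:
        for row in csv.reader(f):
            identifier = row[0]  # assumption: the identifier sits in the first column
            r = requests.get(URL_TEMPLATE.format(identifier))
            soup = BeautifulSoup(r.text, "html.parser")
            # Assumption: the generated statements appear in the page's first <textarea>
            box = soup.find("textarea")
            results[identifier] = box.get_text() if box else None
            time.sleep(1)  # be gentle with the Toolforge server
    return results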
import requests
from bs4 import BeautifulSoup

niosh_mode = False

# Both seeds hit the Wikidata Query Service. Decoded, the first query is
#   SELECT ?item WHERE { ?item wdt:P932 ?dummy0 . ?item wdt:P859 wd:Q60346 . }
# i.e. items with a PMCID (P932) sponsored by (P859) NIOSH (Q60346); the second
# swaps the sponsor clause for MINUS, i.e. items with a PMCID not sponsored by NIOSH.
if niosh_mode:
    seed = "https://query.wikidata.org/sparql?format=json&query=SELECT%20%3Fitem%20WHERE%20%7B%0A%20%20%3Fitem%20wdt%3AP932%20%3Fdummy0%20.%0A%20%20%3Fitem%20wdt%3AP859%20wd%3AQ60346%20.%0A%7D"
else:
    seed = "https://query.wikidata.org/sparql?format=json&query=SELECT%20%3Fitem%20WHERE%20%7B%0A%20%20%3Fitem%20wdt%3AP932%20%3Fdummy0%20.%0A%20%20MINUS%20%7B%20%3Fitem%20wdt%3AP859%20wd%3AQ60346%20%7D%0A%7D"
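The preview ends after the seed selection. A short sketch of the likely next step, continuing the code above by fetching the seed query and pulling the Q-identifiers out of the SPARQL JSON results (an assumption about how the gist goes on):

r = requests.get(seed)
bindings = r.json()["results"]["bindings"]
# Each binding's item.value is a full entity URI; keep just the trailing Q-number
item_ids = [b["item"]["value"].rsplit("/", 1)[-1] for b in bindings]
print(len(item_ids), "items to process")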
import html
import requests
import threading

class AskPubMed(threading.Thread):
    def __init__(self, threadID, name, packages):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.packages = packages
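The class body is cut off before run(). A hypothetical run() that asks PubMed's E-utilities for each batch; the esummary endpoint choice and the reading of packages as lists of PMIDs are assumptions, not the gist's actual behaviour.

import threading
import requests

EUTILS = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"

class AskPubMedSketch(threading.Thread):
    # Hypothetical worker: each package is assumed to be a list of PMIDs
    def __init__(self, threadID, name, packages):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.packages = packages
        self.results = {}

    def run(self):
        for package in self.packages:
            r = requests.get(EUTILS, params={
                "db": "pubmed", "id": ",".join(package), "retmode": "json"})
            self.results.update(r.json().get("result", {}))

# Usage: one worker per package list, then join them all
# workers = [AskPubMedSketch(i, "worker-%d" % i, chunk) for i, chunk in enumerate(chunks)]
# for w in workers: w.start()
# for w in workers: w.join()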
import random
import requests
from time import sleep

while True:
    random_id = ''.join(random.choices('0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-_', k=11))
    url = 'https://www.youtube.com/watch?v=' + random_id
    r = requests.get(url)
    if r.text.find('This video is unavailable') == -1:
        print(url)
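sleep is imported but never appears in the preview, which suggests the loop is throttled further down. The same probe with the delay restored; the one-second interval is an assumption.

import random
import requests
from time import sleep

ALPHABET = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-_'

while True:
    random_id = ''.join(random.choices(ALPHABET, k=11))
    url = 'https://www.youtube.com/watch?v=' + random_id
    r = requests.get(url)
    if 'This video is unavailable' not in r.text:
        print(url)
    sleep(1)  # assumption: pause between probes rather than hammering YouTube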
# Timeout
Processing wikidump-000001399.ttl.gz
SPARQL-UPDATE: updateStr=LOAD <file:///srv/mungeOut//wikidump-000001399.ttl.gz>
java.util.concurrent.TimeoutException
    at java.util.concurrent.FutureTask.get(FutureTask.java:205)
    at com.bigdata.rdf.sail.webapp.BigdataServlet.submitApiTask(BigdataServlet.java:292)
    at com.bigdata.rdf.sail.webapp.QueryServlet.doSparqlUpdate(QueryServlet.java:460)
    at com.bigdata.rdf.sail.webapp.QueryServlet.doPost(QueryServlet.java:241)
    at com.bigdata.rdf.sail.webapp.RESTServlet.doPost(RESTServlet.java:269)
    at com.bigdata.rdf.sail.webapp.MultiTenancyServlet.doPost(MultiTenancyServlet.java:195)
jh@Scatter-1 ~ % curl -i http://localhost:63342/Cyberbot_II/IABot/www/setup.php\?_ijt=ek248v577c3ch1l8u1c3mq48gb
HTTP/1.1 200 OK
X-Powered-By: PHP/7.2.28
Set-Cookie: IABotManagementConsole=dbgrl4h62b779fr9luufh5qmm4; expires=Sun, 12-Apr-2020 20:54:31 GMT; Max-Age=2592000; path=/Cyberbot_II/IABot/www
Cache-Control: no-store, must-revalidate
server: PhpStorm 2019.3.3
content-length: 9486
set-cookie: Phpstorm-e21bdce2=b191c1a9-572c-4e8a-b862-7d21cf880eae; Max-Age=315360000; Expires=Mon, 11 Mar 2030 20:54:31 GMT; Path=/; HTTPOnly; SameSite=strict
Set-Cookie: IABotManagementConsole=dbgrl4h62b779fr9luufh5qmm4; expires=Sun, 12-Apr-2020 20:54:31 GMT; Max-Age=2592000; path=/Cyberbot_II/IABot/www
import requests
import json
from time import sleep
from pprint import pprint
# This is a quick script I came up with for ingesting "munged" Wikidata TTL dumps
# into Amazon Neptune, one at a time, going as fast as possible while respecting
# queue limits.
for i in range(0, 4243):
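The loop body is not shown. A sketch of what a one-at-a-time Neptune bulk-load loop could look like, using the standard Neptune loader endpoint; the cluster endpoint, S3 bucket, IAM role, region, and file-name pattern are all placeholders, not values from the gist.

import requests
from time import sleep

NEPTUNE = "https://your-neptune-endpoint:8182"                 # placeholder
SOURCE = "s3://your-bucket/mungeOut/wikidump-{0:09d}.ttl.gz"   # placeholder pattern
IAM_ROLE = "arn:aws:iam::123456789012:role/NeptuneLoadFromS3"  # placeholder

for i in range(0, 4243):
    payload = {
        "source": SOURCE.format(i),
        "format": "turtle",
        "iamRoleArn": IAM_ROLE,
        "region": "us-east-1",
        "queueRequest": "TRUE",
    }
    # Keep retrying the same file if the loader queue is full
    while True:
        r = requests.post(NEPTUNE + "/loader", json=payload)
        if r.status_code == 200:
            break
        sleep(60)
    load_id = r.json()["payload"]["loadId"]
    # Poll until this load finishes before queuing the next dump
    while True:
        status = requests.get(NEPTUNE + "/loader/" + load_id).json()
        if status["payload"]["overallStatus"]["status"] != "LOAD_IN_PROGRESS":
            break
        sleep(15)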