James Hare (harej)

harej / niosh_scraper.py
Last active September 25, 2015 15:21
A script to scrape the Pocket Guide to Chemical Hazards on NIOSH's website
# public domain
from bs4 import BeautifulSoup
import requests

def main():
    manifest = {}
    for id in range(1, 687):  # starting with PGCH #1 and going to #686, the last one
        if id == 553:  # this one is irregular and should be skipped
            continue
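The preview cuts off before the fetch-and-parse step. A minimal sketch of how each PGCH page might be retrieved and parsed; the zero-padded npgdNNNN.html URL pattern on cdc.gov and the title-only parsing are assumptions, not the gist's actual code.

import requests
from bs4 import BeautifulSoup

def scrape_entry(pgch_id):
    # Assumption: PGCH entries live at zero-padded npgdNNNN.html URLs
    url = "https://www.cdc.gov/niosh/npg/npgd{0:04d}.html".format(pgch_id)
    r = requests.get(url)
    if r.status_code != 200:
        return None
    soup = BeautifulSoup(r.text, "html.parser")
    # Keep only the page title as a stand-in for the chemical name
    title = soup.find("title")
    return {"id": pgch_id, "title": title.get_text(strip=True) if title else None}

In the loop above, the result would go into the manifest, e.g. manifest[id] = scrape_entry(id).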
harej / npg_gap_analysis.py
Last active June 9, 2016 21:20
Generates a list of items and properties used on NPG-related Wikidata entries and assesses the existence of labels in other languages
# Step 1: Get list of any Wikidata item with NPG ID and anything that is a subclass of chemical hazard
# Step 2: Iterate through each item for invoked items and properties
# (for claim in claims; for subclaim in claim: 'Q' + str(subclaim['mainsnak']['datavalue']['value']['numeric-id'])
#  and subclaim['mainsnak']['property'], where subclaim['mainsnak']['datatype'] == 'wikibase-item')
# Step 3: De-duplicate to generate exhaustive list of each item/property of interest to NIOSH
# Step 4: Check labels: en, es, zh, fr, de
# Step 5: Prepare HTML table that lists each item/property of interest, highlighting cells where values are missing
# Step 6: Take percentages of coverage in each language; save to a timestamped log
import requests
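Only the first import of the gist is shown. Steps 2 through 4 map fairly directly onto the Wikidata API; below is a rough sketch of the claim-walking and label-checking parts, assuming wbgetentities is used for the label lookups (the helper names are illustrative, not from the gist).

import requests

LANGUAGES = ["en", "es", "zh", "fr", "de"]
API = "https://www.wikidata.org/w/api.php"

def items_and_properties(claims):
    # Steps 2-3: walk every claim, collect referenced items and properties, de-duplicated
    found = set()
    for prop, statements in claims.items():
        for statement in statements:
            snak = statement["mainsnak"]
            if snak.get("datatype") == "wikibase-item" and "datavalue" in snak:
                found.add("Q" + str(snak["datavalue"]["value"]["numeric-id"]))
                found.add(snak["property"])
    return found

def label_coverage(entity_ids):
    # Step 4: check which of the five languages have a label for each entity
    coverage = {}
    ids = sorted(entity_ids)
    for i in range(0, len(ids), 50):  # wbgetentities accepts up to 50 IDs per request
        batch = ids[i:i + 50]
        r = requests.get(API, params={
            "action": "wbgetentities", "ids": "|".join(batch),
            "props": "labels", "languages": "|".join(LANGUAGES), "format": "json"})
        for entity_id, entity in r.json().get("entities", {}).items():
            labels = entity.get("labels", {})
            coverage[entity_id] = {lang: lang in labels for lang in LANGUAGES}
    return coverage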
harej / citoid.py
Created April 22, 2016 19:31
Creates a CSV based on Citoid output
import requests
import csv
from collections import defaultdict

def get_citation(inputstring):
    # Ask the Citoid service for citation metadata in its mediawiki output format
    r = requests.get("https://citoid.wikimedia.org/api?format=mediawiki&search=" + inputstring)
    return r.json()[0]
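The CSV step itself is not shown in the preview. A sketch of how the Citoid responses might be flattened into rows, one identifier per row; the field list and output file name are assumptions.

import csv

def write_citations(identifiers, outfile="citations.csv"):
    # Hypothetical CSV step: one row per identifier plus a few common Citoid fields
    fields = ["itemType", "title", "date", "publicationTitle", "DOI", "PMID"]
    with open(outfile, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=["identifier"] + fields)
        writer.writeheader()
        for identifier in identifiers:
            citation = get_citation(identifier)
            row = {"identifier": identifier}
            row.update({key: citation.get(key, "") for key in fields})
            writer.writerow(row)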
harej / sourcemetadata_scraper.py
Created May 26, 2016 09:30
A script that scrapes tools.wmflabs.org/sourcemd
import requests
import time
import csv
from bs4 import BeautifulSoup

def main(sourcefile):
    url_template = "https://tools.wmflabs.org/sourcemd/?id={0}&doit=Check+source"
    with open(sourcefile) as f:
        csvdump = csv.reader(f)
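The preview stops before the scraping loop. A sketch of how each row might be fetched and parsed, assuming the identifier (DOI/PMID) sits in the first CSV column and the tool's generated statements appear in a textarea on the result page; both are assumptions about the unseen part of the gist.

import csv
import time
import requests
from bs4 import BeautifulSoup

URL_TEMPLATE = "https://tools.wmflabs.org/sourcemd/?id={0}&doit=Check+source"

def scrape(sourcefile):
    results = {}
    with open(sourcefile) as f:
        for row in csv.reader(f):
            identifier = row[0]  # assumption: the identifier sits in the first column
            r = requests.get(URL_TEMPLATE.format(identifier))
            soup = BeautifulSoup(r.text, "html.parser")
            # Assumption: the generated statements appear in the page's first <textarea>
            box = soup.find("textarea")
            results[identifier] = box.get_text() if box else None
            time.sleep(1)  # be gentle with the Toolforge server
    return results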
import requests
from bs4 import BeautifulSoup

niosh_mode = False

# Both seeds hit the Wikidata Query Service. Decoded, the first query is
#   SELECT ?item WHERE { ?item wdt:P932 ?dummy0 . ?item wdt:P859 wd:Q60346 . }
# i.e. items with a PMCID (P932) sponsored by (P859) NIOSH (Q60346); the second
# swaps the sponsor clause for MINUS, i.e. items with a PMCID not sponsored by NIOSH.
if niosh_mode:
    seed = "https://query.wikidata.org/sparql?format=json&query=SELECT%20%3Fitem%20WHERE%20%7B%0A%20%20%3Fitem%20wdt%3AP932%20%3Fdummy0%20.%0A%20%20%3Fitem%20wdt%3AP859%20wd%3AQ60346%20.%0A%7D"
else:
    seed = "https://query.wikidata.org/sparql?format=json&query=SELECT%20%3Fitem%20WHERE%20%7B%0A%20%20%3Fitem%20wdt%3AP932%20%3Fdummy0%20.%0A%20%20MINUS%20%7B%20%3Fitem%20wdt%3AP859%20wd%3AQ60346%20%7D%0A%7D"
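The preview ends after the seed selection. A short sketch of the likely next step, continuing the code above by fetching the seed query and pulling the Q-identifiers out of the SPARQL JSON results (an assumption about how the gist goes on):

r = requests.get(seed)
bindings = r.json()["results"]["bindings"]
# Each binding's item.value is a full entity URI; keep just the trailing Q-number
item_ids = [b["item"]["value"].rsplit("/", 1)[-1] for b in bindings]
print(len(item_ids), "items to process")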
import html
import requests
import threading

class AskPubMed(threading.Thread):
    def __init__(self, threadID, name, packages):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.packages = packages
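The class body is cut off before run(). A hypothetical run() that asks PubMed's E-utilities for each batch; the esummary endpoint choice and the reading of packages as lists of PMIDs are assumptions, not the gist's actual behaviour.

import threading
import requests

EUTILS = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"

class AskPubMedSketch(threading.Thread):
    # Hypothetical worker: each package is assumed to be a list of PMIDs
    def __init__(self, threadID, name, packages):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.packages = packages
        self.results = {}

    def run(self):
        for package in self.packages:
            r = requests.get(EUTILS, params={
                "db": "pubmed", "id": ",".join(package), "retmode": "json"})
            self.results.update(r.json().get("result", {}))

# Usage: one worker per package list, then join them all
# workers = [AskPubMedSketch(i, "worker-%d" % i, chunk) for i, chunk in enumerate(chunks)]
# for w in workers: w.start()
# for w in workers: w.join()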
import random
import requests
from time import sleep

while True:
    random_id = ''.join(random.choices('0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-_', k=11))
    url = 'https://www.youtube.com/watch?v=' + random_id
    r = requests.get(url)
    if r.text.find('This video is unavailable') == -1:
        print(url)
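sleep is imported but never appears in the preview, which suggests the loop is throttled further down. The same probe with the delay restored; the one-second interval is an assumption.

import random
import requests
from time import sleep

ALPHABET = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-_'

while True:
    random_id = ''.join(random.choices(ALPHABET, k=11))
    url = 'https://www.youtube.com/watch?v=' + random_id
    r = requests.get(url)
    if 'This video is unavailable' not in r.text:
        print(url)
    sleep(1)  # assumption: pause between probes rather than hammering YouTube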
# Timeout
Processing wikidump-000001399.ttl.gz
SPARQL-UPDATE: updateStr=LOAD <file:///srv/mungeOut//wikidump-000001399.ttl.gz>
java.util.concurrent.TimeoutException
    at java.util.concurrent.FutureTask.get(FutureTask.java:205)
    at com.bigdata.rdf.sail.webapp.BigdataServlet.submitApiTask(BigdataServlet.java:292)
    at com.bigdata.rdf.sail.webapp.QueryServlet.doSparqlUpdate(QueryServlet.java:460)
    at com.bigdata.rdf.sail.webapp.QueryServlet.doPost(QueryServlet.java:241)
    at com.bigdata.rdf.sail.webapp.RESTServlet.doPost(RESTServlet.java:269)
    at com.bigdata.rdf.sail.webapp.MultiTenancyServlet.doPost(MultiTenancyServlet.java:195)
jh@Scatter-1 ~ % curl -i http://localhost:63342/Cyberbot_II/IABot/www/setup.php\?_ijt=ek248v577c3ch1l8u1c3mq48gb
HTTP/1.1 200 OK
X-Powered-By: PHP/7.2.28
Set-Cookie: IABotManagementConsole=dbgrl4h62b779fr9luufh5qmm4; expires=Sun, 12-Apr-2020 20:54:31 GMT; Max-Age=2592000; path=/Cyberbot_II/IABot/www
Cache-Control: no-store, must-revalidate
server: PhpStorm 2019.3.3
content-length: 9486
set-cookie: Phpstorm-e21bdce2=b191c1a9-572c-4e8a-b862-7d21cf880eae; Max-Age=315360000; Expires=Mon, 11 Mar 2030 20:54:31 GMT; Path=/; HTTPOnly; SameSite=strict
Set-Cookie: IABotManagementConsole=dbgrl4h62b779fr9luufh5qmm4; expires=Sun, 12-Apr-2020 20:54:31 GMT; Max-Age=2592000; path=/Cyberbot_II/IABot/www
import requests
import json
from time import sleep
from pprint import pprint
# This is a quick script I came up with for ingesting "munged" Wikidata TTL dumps
# into Amazon Neptune, one at a time, going as fast as possible while respecting
# queue limits.
for i in range(0, 4243):
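The loop body is not shown. A sketch of what a one-at-a-time Neptune bulk-load loop could look like, using the standard Neptune loader endpoint; the cluster endpoint, S3 bucket, IAM role, region, and file-name pattern are all placeholders, not values from the gist.

import requests
from time import sleep

NEPTUNE = "https://your-neptune-endpoint:8182"                 # placeholder
SOURCE = "s3://your-bucket/mungeOut/wikidump-{0:09d}.ttl.gz"   # placeholder pattern
IAM_ROLE = "arn:aws:iam::123456789012:role/NeptuneLoadFromS3"  # placeholder

for i in range(0, 4243):
    payload = {
        "source": SOURCE.format(i),
        "format": "turtle",
        "iamRoleArn": IAM_ROLE,
        "region": "us-east-1",
        "queueRequest": "TRUE",
    }
    # Keep retrying the same file if the loader queue is full
    while True:
        r = requests.post(NEPTUNE + "/loader", json=payload)
        if r.status_code == 200:
            break
        sleep(60)
    load_id = r.json()["payload"]["loadId"]
    # Poll until this load finishes before queuing the next dump
    while True:
        status = requests.get(NEPTUNE + "/loader/" + load_id).json()
        if status["payload"]["overallStatus"]["status"] != "LOAD_IN_PROGRESS":
            break
        sleep(15)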