Skip to content

Instantly share code, notes, and snippets.

View edsu's full-sized avatar

Ed Summers edsu

View GitHub Profile
@edsu
edsu / paradise-lost-domains.txt
Created November 29, 2022 13:09
public_domains "paradise lost" > paradise-lost-domains.txt
cumbrous.elements.earth
vaulted.either.host
without.exteriour.help
creating.derivative.works
reward.though.here
father.without.cloud
project.gutenbergtm.name
unfold.celestial.guide
fierce.strive.here
renowned.alcinous.host
import string
import sys
import requests
import whois
from nltk import tokenize
BOOKFILE = sys.argv[1]
OUTPUTFILE = BOOKFILE + '.possible-domains.txt'
[
{
"url": "https://www.theonion.com/nasa-panics-after-asteroid-fires-back-1849587289",
"status": 200,
"content_type": "text/html; charset=utf-8",
"title": "NASA Panics After Asteroid Fires Back",
"description": "The Onion brings you all of the latest news, stories, photos, videos and more from America's finest news source.",
"image": "https://i.kinja-img.com/gawker-media/image/upload/c_fill,f_auto,fl_progressive,g_center,h_675,pg_1,q_80,w_1200/f0c0e17e9ed6acb0e1bb946a9e233c1d.jpg",
"publisher": "The Onion",
"keywords": [
This file has been truncated, but you can view the full file.
{
"id": 1413,
"creator": "Ed Summers",
"query": [
{
"id": 1413,
"searchId": 1413,
"created": "2022-10-13T21:40:22.102Z",
"value": {
"or": [
#!/usr/bin/env python3
# Use https://github.com/unitedstates/ data to get JSON for current members of
# congress including their social media accounts.
#
# Note: you will need to pip install requests and pyyaml
import json
import yaml
import requests
@edsu
edsu / check.py
Created September 24, 2022 21:54
#!/usr/bin/env python3
#
# This demonstrates an inconsistency in results from the Internet Archive CDX
# API when querying by scopeType=domain vs scopeType=prefix. For context see:
#
# https://inkdroid.org/2022/09/24/pdfs/
#
# Note: you'll need to
#
globals()['y'] = 1
#!/usr/bin/env python3
import json
druids = ['bj330fg0526', 'bp312sd3142', 'bs648dv9357', 'bz893jg7695', 'bz922hc1158', 'cc095kz3027', 'ch908dt6803', 'cp809cz8166', 'cv292vs5727', 'dn752dz0508', 'dy271hk6968', 'fd892fn4310', 'fj109wp2130', 'fn912wb3725', 'fp815hx3553', 'fs415vb1264', 'fv812yp9241', 'fw782ks7983', 'gf100kp6588', 'gj901jn9353', 'hf001pb6273', 'hh929wg3298', 'hn217tx5368', 'hq140wy0905', 'hv642nf7717', 'hv698ks1475', 'hw434pj6642', 'hw645gv7743', 'jb739pj9696', 'jg940ts4575', 'jh597wr5998', 'jz331hr5976', 'kw186hs7975', 'kx196rt8122', 'ky214ft2956', 'ky357nb9554', 'mg249dy7051', 'mk879xr0461', 'mv110pd4781', 'mv300dt6569', 'mx349xb4098', 'mz415jv3453', 'nd087pt9085', 'nk906ht6735', 'nn453zz9250', 'nr015ch1092', 'nv773xq7981', 'pf139tj8228', 'pn628yn6194', 'pq169jd6716', 'px611qw1504', 'qd726vf4177', 'qk039cf4369', 'qw725qm9638', 'qx771bj6775', 'rv306cp2774', 'sd725cc2793', 'sk583gg2589', 'sn506gj4859', 'sq394vr6558', 'sq694nb4696', 'st474bt2800', 'tk364rs5190', 'tw357sy1852', 'tx189sh1771', 't
@edsu
edsu / wacz-images.py
Last active February 19, 2024 03:08
#!/usr/bin/env python3
#
# usage: wacz-images.py <wacz_file>
#
# This program will extract images from the WARC files contained in a WACZ
# file and write them to the current working directory using the image's URL
# as a file location.
#
# You will need to `pip install warcio` for it to work.
# print out the url and title of web pages in a WARC file
import bs4
import sys
from warcio.archiveiterator import ArchiveIterator
warc_file = sys.argv[1]
records = ArchiveIterator(open(warc_file, 'rb'))